In [None]:
# default_exp pytorch_mnist_HPO

In [50]:
#export
# from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR



The WMLA framework requires 2 changes to your code to support the HPO API, and these are:

* Inject hyperparameters for the sub-training during search
* Retrieve sub-training result metric


1. Import the dependent libararies:

&nbsp;
&nbsp;
![image1](https://github.com/IBM/wmla-learning-path/raw/dev/shared-images/hpo_update_model_0.png)
&nbsp;
&nbsp;


In [None]:
# HPO - import dependent lib
import json
import os

2. Get the WMLA cluster `DLI_DATA_FS`, `RESULT_DIR` and `LOG_DIR` for the HPO training job. The `DLI_DATA_FS` can be used for shared data placement, the `RESULT_DIR` can be used for final model saving, and the `LOG_DIR` can be used for user logs and monitoring.

&nbsp;
**Note**: `DLI_DATA_FS` is set when installing the DLI cluster; `RESULT_DIR` and `LOG_DIR` is generated by WMLA for each HPO experiment.

&nbsp;
&nbsp;
![image1](https://github.com/IBM/wmla-learning-path/raw/dev/shared-images/hpo_update_model_1.png)
&nbsp;
&nbsp;


In [51]:
#export
# get dataset from DLI_DATA_FS
dataDir=None
try :
    # If invoked by WMLA
    dataDir = os.environ["DLI_DATA_FS"]
    model_path = os.environ["RESULT_DIR"]+"/model/saved_model"
except :
    # For notebook run
    dataDir = "/tmp"
    model_path = "/tmp/model/saved_model"
    
if dataDir is not None:
    print("dataDir is: %s"%dataDir)
else:
    print("Warning: not found DATA_DIR from os env!")

print ("model_path: %s" %model_path)

dataDir is: /tmp
model_path: /tmp/model/saved_model
failed to get hyper-parameters from config.json


3. Replace the hyperparameter definition code by reading hyperparameters from the `config.json` file. the `config.json` is generated by WMLA HPO, which contains a set of hyperparameter candidates for each tuning jobs. The hyperparameters and the search space is defined when submitting the HPO task. For example, here the hyperparameter `learning_rate` is set to tune:

&nbsp;
&nbsp;
![image2](https://github.com/IBM/wmla-learning-path/raw/dev/shared-images/hpo_update_model_2.png)

&nbsp;
Then you could use the hyperparameter you get from `config.json` where you want:
&nbsp;
![image2](https://github.com/IBM/wmla-learning-path/raw/dev/shared-images/hpo_update_model_2_2.png)
&nbsp;
&nbsp;


In [None]:
# HPO - get hpo experiment hyper-parameter values from config.json
# The hyperparameters and the search space is defined when submitting the HPO task
# WMLA HPO will generate hpo experiment candidates and writes to config.json
try:
    hyper_params = json.loads(open("config.json").read())
    print('hyper_params: ', hyper_params)
    learning_rate = float(hyper_params.get("learning_rate", "0.01"))
except:
    print('failed to get hyper-parameters from config.json')
    learning_rate = 0.001
    pass


In [52]:
#export
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


In [53]:
#export
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


In [20]:
#export
test_metrics = []
def test(model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    test_metrics.append((epoch, {"loss": float(test_loss)}))


In [None]:
#export
try: from nbdev.imports import IN_NOTEBOOK
except: IN_NOTEBOOK=False
print("Running in notebook" if IN_NOTEBOOK else "Not running in notebook")

4.  Write the tuning result into `val_dict_list.json` under `RESULT_DIR`. WMLA HPO will read this file for each tuning job to get the metric values. Define a `test_metrics` list to store all metric values and pass the epoch parameter to the test function. Then you can add the metric values to the `test_metrics` list during the training test process. Please note that the metric names should be specified when submitting the HPO task, and be consistent with the code here.
&nbsp;
For example, at the HPO task submit request, `loss` will be used as the objective metric the tuning will try to minimize the `loss`:


In [48]:
#export
def main(args):

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(dataDir, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(dataDir, train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader, epoch)
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")


    # HPO - dump metric values to val_dict_list.json start
    training_out =[]
    for test_metric in test_metrics:
        out = {'steps':test_metric[0]}
        for (metric,value) in test_metric[1].items():
            out[metric] = value
        training_out.append(out)
    with open('{}/val_dict_list.json'.format(os.environ['RESULT_DIR']), 'w') as f:
        json.dump(training_out, f)



64

Test set: Average loss: 1.7030, Accuracy: 7311/10000 (73%)




Test set: Average loss: 1.0685, Accuracy: 8176/10000 (82%)


Test set: Average loss: 0.7913, Accuracy: 8426/10000 (84%)



AttributeError: 'Args' object has no attribute 'save_model'

In [56]:
# Code To handle submission from notebook, or as a *.py program after export

import sys
if __name__ == '__main__' and not IN_NOTEBOOK:
    print("Running in python program mode")
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()
    
    main(args)
else :
    print("Running in Notebook mode")
    class Args(object):
      def __init__(self):
        self.batch_size = 64
        self.no_cuda = False
        self.seed = 0
        self.test_batch_size = 64
        self.gamma=0.7
        self.epochs= 3
        self.log_interval=10
    args = Args()
    main(args)

64

Test set: Average loss: 1.7030, Accuracy: 7311/10000 (73%)




Test set: Average loss: 1.0685, Accuracy: 8176/10000 (82%)


Test set: Average loss: 0.7913, Accuracy: 8426/10000 (84%)



AttributeError: 'Args' object has no attribute 'save_model'

In [55]:
from nbdev.export import notebook2script
notebook2script()


Converted 00_conductor_l3_lab.ipynb.
Converted 00_pytorch_mnist_HPO.py.ipynb.
Converted 00_sparkpi_example.ipynb.
Converted 01_classify_images_dli.ipynb.
Converted 01_custom_classifier_dli.ipynb.
Converted 02_hpo.ipynb.
Converted 02_hpo_custom_experiment.ipynb.
Converted 03_edt.ipynb.
Converted 04_CPD2.5-WMLA-Python-Client-E2E-flow.ipynb.
Converted 04_edi.ipynb.
Converted 05-wmla-api-submitting-hyperparameter-optimization.ipynb.
Converted 05_cpd2.5-wmla-python-client-e2e-flow.ipynb.
Converted index.ipynb.
