In [51]:
import os
from tqdm.notebook import tqdm
import sys
import logging
import matplotlib.pyplot as plt

import mlflow

import torch
from torchvision import datasets, transforms

from sagemaker.session import Session

In [10]:
# Notebook-wide logger that writes DEBUG-and-above records to stdout (the cell output).
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every execution of this cell, so an
# unconditional addHandler would stack one more StreamHandler per re-run and every
# record would be printed multiple times. Attach the handler only once.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [18]:
# Point MLflow at the SageMaker-managed tracking server. The ARN is specific to one
# AWS account and region, so allow it to be overridden through the MLFLOW_TRACKING_URI
# environment variable; the original lab server stays the default when it is unset.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "arn:aws:sagemaker:ap-southeast-2:954690186719:mlflow-tracking-server/SageMaker-Experiment-Lab",
    )
)

In [11]:
# Names handed to train_model: the MLflow experiment to record under, and the
# run name given to mlflow.start_run.
experiment_name: str = "pytorch-mlflow-experiment-lab"
run_name: str = "pytorch-exp-1"

In [12]:
# Resolve the region once instead of constructing a new SageMaker Session for
# every URL.
MNIST_BASE_URL = (
    f"https://sagemaker-example-files-prod-{Session().boto_region_name}"
    ".s3.amazonaws.com/datasets/image/MNIST"
)

# Download MNIST from the regional SageMaker example bucket.
# NOTE(review): recent torchvision releases appear to download through
# MNIST.mirrors / MNIST.resources rather than MNIST.urls -- confirm that this
# override still takes effect with the installed torchvision version.
datasets.MNIST.urls = [
    f"{MNIST_BASE_URL}/{file_name}"
    for file_name in (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )
]

# One preprocessing pipeline shared by both splits so they cannot drift apart.
# Mean and std are both one-element tuples (one entry per image channel); the
# values are presumably the MNIST training-set statistics -- confirm.
mnist_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

# Both splits are stored under the local "mnist_data" directory.
train_set = datasets.MNIST("mnist_data", train=True, transform=mnist_transform, download=True)
test_set = datasets.MNIST("mnist_data", train=False, transform=mnist_transform, download=True)

In [38]:
class MnistClassifier(torch.nn.Module):
    """Small CNN: two conv + max-pool stages followed by two linear layers.

    The network takes single-channel images and returns log-probabilities over
    10 classes (``log_softmax`` along dim 1).

    Args:
        hidden_channels: Number of output channels of the first convolution.
        kernel_size: Kernel size used by both convolutions.
        drop_out: Drop probability of the ``Dropout2d`` applied after the second
            convolution. The dropout applied after ``fc1`` is called without an
            explicit probability and is therefore not controlled by this value.
        input_size: Spatial size ``(height, width)`` of the input images, or a
            single int for square images. Only used to size the first linear
            layer. Defaults to ``(28, 28)``, which together with
            ``kernel_size=5`` yields the previously hard-coded 320 features.
    """

    def __init__(self, hidden_channels, kernel_size, drop_out, input_size=(28, 28)):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(1, hidden_channels, kernel_size=kernel_size)
        self.conv2 = torch.nn.Conv2d(hidden_channels, 20, kernel_size=kernel_size)
        self.conv2_drop = torch.nn.Dropout2d(p=drop_out)

        # Derive the flattened feature count from the conv/pool stack instead of
        # hard-coding 320, which is only correct for kernel_size=5 on 28x28 input.
        # The probe skips dropout so it draws no random numbers, and fc1/fc2 are
        # still created after the convolutions, so seeded initialisation order is
        # unchanged.
        if isinstance(input_size, int):
            input_size = (input_size, input_size)
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_size)
            probe = torch.nn.functional.max_pool2d(self.conv1(probe), 2)
            probe = torch.nn.functional.max_pool2d(self.conv2(probe), 2)
        flat_features = probe.numel()

        self.fc1 = torch.nn.Linear(flat_features, 50)
        self.fc2 = torch.nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = torch.nn.functional.relu(torch.nn.functional.max_pool2d(self.conv1(x), 2))
        x = torch.nn.functional.relu(
            torch.nn.functional.max_pool2d(self.conv2_drop(self.conv2(x)), 2)
        )
        # Flatten to the width fc1 was built for (320 with the default settings).
        x = x.view(-1, self.fc1.in_features)
        x = torch.nn.functional.relu(self.fc1(x))
        # Functional dropout must be told the mode explicitly; it is a no-op in eval().
        x = torch.nn.functional.dropout(x, training=self.training)
        output = torch.nn.functional.log_softmax(self.fc2(x), dim=1)

        return output
        

In [58]:
def log_performance(model, data_loader, device, epoch, metric_type="Test"):
    """Evaluate ``model`` on ``data_loader`` and log loss/accuracy to MLflow.

    The model is switched to eval mode and is left in eval mode on return, so
    callers that continue training must call ``model.train()`` again.

    Args:
        model: Module returning per-class log-probabilities (fed to ``nll_loss``).
        data_loader: Iterable of ``(data, target)`` batches that supports ``len()``.
        device: Device each batch is moved to before the forward pass.
        epoch: Used as the MLflow ``step`` and shown in the progress-bar label.
        metric_type: Prefix of the metric names, ``"{metric_type}_loss"`` and
            ``"{metric_type}_accuracy"``.

    Returns:
        Tuple ``(loss, accuracy)``: the mean negative log-likelihood per sample
        and the accuracy in percent, i.e. the two values logged to MLflow.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for data, target in tqdm(data_loader, total=len(data_loader), desc=f"Log {metric_type} Performance (epoch {epoch}) "):
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not mean) per batch so the final average is per sample,
            # independent of batch sizes.
            total_loss += torch.nn.functional.nll_loss(
                output, target, reduction="sum"
            ).item()

            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()
            seen += target.size(0)

    # Normalise by the samples actually evaluated rather than len(dataset): the
    # two differ when the loader drops the last batch or uses a sampler.
    if seen == 0:
        raise ValueError(f"Cannot compute {metric_type} metrics: data_loader yielded no samples")

    loss = total_loss / seen
    accuracy = 100.0 * correct / seen

    mlflow.log_metric(f"{metric_type}_loss", loss, step=epoch)
    mlflow.log_metric(f"{metric_type}_accuracy", accuracy, step=epoch)

    return loss, accuracy

In [59]:
def train_model(
    train_set, test_set, param_set, experiment_name, run_name, data_dir="mnist_data", optimizer="sgd"
):
    """Train an ``MnistClassifier`` on CPU inside one MLflow run and return it.

    Hyperparameters are logged as MLflow params; after every epoch
    ``log_performance`` logs train and test loss/accuracy with the epoch number
    as step. The global torch seed is set to 42 before the loaders and the model
    are created.

    Args:
        train_set: Dataset used for optimisation.
        test_set: Dataset used for per-epoch and final evaluation.
        param_set: Mapping with the keys ``batch_size``, ``hidden_channels``,
            ``kernel_size``, ``dropout``, ``momentum``, ``lr`` and ``epochs``.
            ``momentum`` is logged for every run but only used by SGD.
        experiment_name: MLflow experiment to record the run under.
        run_name: Name given to the MLflow run.
        data_dir: Currently unused; kept so existing callers keep working.
        optimizer: ``"sgd"`` selects ``torch.optim.SGD``; any other value falls
            back to ``torch.optim.Adam``.

    Returns:
        The trained model, wrapped in ``torch.nn.DataParallel`` and left in
        eval mode.
    """
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # NOTE(review): MLflow's PyTorch autologging appears to target PyTorch
        # Lightning; with this hand-written loop the explicit log_param /
        # log_metric calls are what record the run -- confirm.
        mlflow.pytorch.autolog()

        mlflow.log_param('BatchSize', param_set['batch_size'])
        mlflow.log_param('HiddenChannels', param_set['hidden_channels'])
        mlflow.log_param('KernelSize', param_set['kernel_size'])
        mlflow.log_param('Dropout', param_set['dropout'])
        mlflow.log_param('Momentum', param_set['momentum'])
        mlflow.log_param('LearningRate', param_set['lr'])
        mlflow.log_param('Epochs', param_set['epochs'])
        mlflow.log_param('Optimizer', optimizer)

        device = torch.device("cpu")

        torch.manual_seed(42)

        train_loader = torch.utils.data.DataLoader(train_set, batch_size=param_set['batch_size'], shuffle=True)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=1000, shuffle=True)

        model = MnistClassifier(param_set['hidden_channels'], kernel_size=param_set['kernel_size'], drop_out=param_set['dropout']).to(device)
        model = torch.nn.DataParallel(model)

        # Keep the optimizer object in its own variable so the `optimizer`
        # argument keeps meaning "the requested optimizer name" throughout.
        if optimizer == 'sgd':
            optim = torch.optim.SGD(model.parameters(), lr=param_set['lr'], momentum=param_set['momentum'])
        else:
            optim = torch.optim.Adam(model.parameters(), lr=param_set['lr'])

        for epoch in range(1, param_set['epochs'] + 1):
            # log_performance leaves the model in eval mode, so re-enable
            # training mode at the start of every epoch.
            model.train()
            for data, target in tqdm(train_loader, desc=f"Training Epoch > {epoch} ", total=len(train_loader)):
                data, target = data.to(device), target.to(device)

                optim.zero_grad()
                loss = torch.nn.functional.nll_loss(model(data), target)
                loss.backward()
                optim.step()

            log_performance(model, train_loader, device, epoch, "Train")
            log_performance(model, test_loader, device, epoch, "Test")

        # Final pass over the test set. Set eval mode explicitly rather than
        # relying on the last log_performance call (which never runs when
        # epochs == 0), and report the result instead of discarding it.
        model.eval()
        correct = 0
        seen = 0
        with torch.no_grad():
            for data, target in tqdm(test_loader, desc=f"Testing"):
                data, target = data.to(device), target.to(device)
                pred = model(data).argmax(dim=1)
                correct += (pred == target).sum().item()
                seen += target.size(0)

        if seen:
            logging.getLogger(__name__).info(
                "Final test accuracy: %.2f%% (%d/%d)", 100.0 * correct / seen, correct, seen
            )

    return model

In [60]:
# Hyperparameters for one training run; train_model reads every key listed here.
PARAM_SET = dict(
    batch_size=64,
    hidden_channels=10,
    kernel_size=5,
    dropout=0.5,
    lr=0.01,
    momentum=0.1,
    epochs=10,
)

In [None]:
# Launch one tracked training run using the datasets, hyperparameters and
# experiment/run names defined in the earlier cells.
train_model(
    train_set=train_set, test_set=test_set, param_set=PARAM_SET,
    experiment_name=experiment_name, run_name=run_name,
)

Training Epoch > 1 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 1) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 1) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 2 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 2) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 2) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 3 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 3) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 3) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 4 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 4) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 4) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 5 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 5) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 5) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 6 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 6) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 6) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 7 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 7) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 7) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 8 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 8) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 8) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 9 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 9) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 9) :   0%|          | 0/10 [00:00<?, ?it/s]

Training Epoch > 10 :   0%|          | 0/938 [00:00<?, ?it/s]

Log Train Performance (epoch 10) :   0%|          | 0/938 [00:00<?, ?it/s]

Log Test Performance (epoch 10) :   0%|          | 0/10 [00:00<?, ?it/s]

Testing:   0%|          | 0/10 [00:00<?, ?it/s]