## Table of Contents
* Setup
    * Libraries
    * Constants

    * Functions
    * Configurations
* Data Exploration
    * Model 1
    * Evaluation Model 1
    * Re-train Model 1
    * Evaluation Model 1.1
* Optimizer Experiment
* Kernel-size & Epoch Experiment
* Model Comparison
    * Evaluation of best Model

## Libraries

In [75]:
"""
<
!pip install -r requirements.txt
!pip list
>
""";

import functools
import os
import time
import warnings

from datetime import datetime
from typing import Dict, List, Tuple, Union

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import torch
import torch.optim as optim
import torchvision

from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    accuracy_score, 
    f1_score, )
from torch import nn, optim
from torch.autograd import Variable

## Constants

In [76]:
DATA_DIR: str = './data'
TRAIN_DATA_DIR: str = './data/train_data'
TEST_DATA_DIR: str = './data/test_data'

# Default learning rate and batch size
LR: float = 1e-4
MINI_BATCH_SIZE: int = 128

CLASSES: Dict[int, str] = {
    0: 'T-shirt/top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle boot',
}

# Search space is e.g., (4) kernel size: (8, 16, 32, 64) epochs
SEARCH_SPACE: Dict[int, List[int]] = {
    4: [8, 16, 32, 64],
    6: [8, 16, 32, 64],
    8: [8, 16, 32, 64],
}

## Functions

In [77]:
def check_dir_and_check_data(_data_dir=DATA_DIR):
    """
    Checks if a data directory exists and if it is populated.
    Download of data starts automatically if directory does 
    not exists or exists and is empty.
    """
    if os.path.exists(_data_dir) and len(os.listdir(_data_dir)) == 0:
        print("Data directory exists.", end='\n\n')
        TRAIN_DATA = torchvision.datasets.FashionMNIST(
            root=TRAIN_DATA_DIR,
            train=True,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=True,
        )
        TEST_DATA = torchvision.datasets.FashionMNIST(
            root=TEST_DATA_DIR,
            train=False,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=True,
        )
        return TRAIN_DATA, TEST_DATA
    
    elif os.path.exists(_data_dir) and len(os.listdir(_data_dir)) != 0:
        print("Data directory exists and is already populated.",)
        TRAIN_DATA = torchvision.datasets.FashionMNIST(
            root=TRAIN_DATA_DIR,
            train=True,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=False,
        )
        TEST_DATA = torchvision.datasets.FashionMNIST(
            root=TEST_DATA_DIR,
            train=False,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=False,
        )
        return TRAIN_DATA, TEST_DATA
        
    else:
        print(f"Create '{_data_dir}' directory.", end='\n\n')
        os.makedirs(_data_dir)

        print("Data directory exists.", end='\n\n')
        TRAIN_DATA = torchvision.datasets.FashionMNIST(
            root=TRAIN_DATA_DIR,
            train=True,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=True,
        )
        TEST_DATA = torchvision.datasets.FashionMNIST(
            root=TEST_DATA_DIR,
            train=False,
            transform=torchvision.transforms.Compose(
                [torchvision.transforms.ToTensor()]),
            download=True,
        )
        return TRAIN_DATA, TEST_DATA

In [78]:
def visualize_one_image(
    data, image_id: int, 
    size: Tuple[int, int] = (500, 500)
):
    """ Visualize one image at a time given the image ID.
    """
    _img, _label = data[image_id]
    fig = px.imshow(
        torchvision.transforms.ToPILImage()(_img),
        title=f'Example: {image_id} <br>Label     : { CLASSES[_label]}',
        color_continuous_scale='RdBu_r', 
        origin='upper',
        height=size[0],
        width=size[1],
    )
    return fig

In [79]:
def timer(func):
    """ Wrapper to keep track of elapsed time of experiments.
    """
    @functools.wraps(func)
    def _timer(*args, **kwargs):
        _start = time.perf_counter()
        value = func(*args, **kwargs)
        _end = time.perf_counter()
        _elapsed_time = _end - _start
        print(f"Elapsed time: {_elapsed_time:.4f} seconds.")
        return value, _elapsed_time
    return _timer

In [80]:
def weight_reset(m) -> None:
    """ Reset weigths before each experiment!
    """
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        m.reset_parameters()

In [81]:
def evaluate_model(eval_model):
    """ Routine to evaluate the FashionMnist models.
    """
    fashion_mnist_eval_dataloader = torch.utils.data.DataLoader(
        TEST_DATA, batch_size=10000, shuffle=False, )

    eval_mini_batch_losses = []

    for _, (images, labels) in enumerate(fashion_mnist_eval_dataloader):

        output = eval_model(images)

        loss = nn.NLLLoss()(output, labels)

        eval_mini_batch_losses.append(loss.data.item())

    y_true = TEST_DATA.targets
    y_pred = torch.argmax(
        eval_model(iter(fashion_mnist_eval_dataloader).next()[0]), dim=1, )
    
    return (
        np.mean(eval_mini_batch_losses), 
        accuracy_score(
            y_true,
            y_pred,
        ),
        classification_report(
            y_true, 
            y_pred,
        ),
        confusion_matrix(
            y_true, 
            y_pred,
        ),
        f1_score(
            y_true, 
            y_pred,
            average='weighted',
        ),
    )

In [82]:
def model_experiments_matrix(
    metric_matrix,
    title: str = 'Model Experiment',
    xlabel: str = "Class Accuracies",
    ylabel: str = "Model",
    colorlabel: str = "Hits",
):
    """ Plots the confusion matrix of an experiment.
    """
    fig = px.imshow(
        metric_matrix,
        title=title,
        template='none',
        labels=dict(
            x=xlabel, 
            y=ylabel, 
            color=colorlabel,
        ),
        x=[*CLASSES.values()],
        y=[*CLASSES.values()],
        aspect='equal',
        color_continuous_scale='RdBu',
        zmin=0, 
        zmax=1000,
    )
    return fig

## Configurations

In [83]:
print(torch.__version__)
print(np.__version__)

# Library settings
os.environ['TZ'] = 'Europe/London'
warnings.filterwarnings('ignore')

seed = 7
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type
torch.cuda.manual_seed(seed)
print(f'Notebook with {str(device)} computation enabled.', end='\n\n')

# Download data
TRAIN_DATA, TEST_DATA = check_dir_and_check_data(DATA_DIR)

# Sanity Check
assert len(TRAIN_DATA) == 60000
assert len(TEST_DATA) == 10000

# Experiments
print(f"Number of experiments: {len(SEARCH_SPACE.keys()) * len(SEARCH_SPACE[4])}")

# Plotly
import plotly.io as pio
pio.renderers.default = 'iframe_connected'

1.10.1+cpu
1.21.4
Notebook with cpu computation enabled.

Data directory exists and is already populated.
Number of experiments: 12


## Data Exploration

In [84]:
image_id = np.random.randint(len(TRAIN_DATA), size=1)[0]
visualize_one_image(TRAIN_DATA, image_id, size=(400, 500)).show()

In [85]:
print(type(TRAIN_DATA))
print(type(TEST_DATA))
print(TRAIN_DATA)
print(TEST_DATA)

<class 'torchvision.datasets.mnist.FashionMNIST'>
<class 'torchvision.datasets.mnist.FashionMNIST'>
Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data/train_data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
           )
Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: ./data/test_data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
           )


## Model 1
Model 1 is the first implementation. Its aim is to be slightly better than the baseline. That is an accuracy greater than 1/10 = 0.1. The class FashionMnist is set up to be used throughout the notebook. The experiments later will focus on the kernel size and the epochs. Hence the kernel size and padding are parmeters.

In [86]:
class FashionMnist(nn.Module):
    """ Defined model for this notebook.
    """
    
    def __init__(
        self, 
        kernel_size: int, 
        padding: int,
    ):
        """
        :param kernel_size: Variable kernel size for different CNNs.
        :param padding: Padding respective to the kernel size.
        """
        super(FashionMnist, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels=1, out_channels=6, kernel_size=kernel_size, 
            stride=1, padding=padding, )
        self.pool1 = nn.MaxPool2d(
            kernel_size=2, stride=2, padding=0, )
        self.conv2 = nn.Conv2d(
            in_channels=6, out_channels=16, kernel_size=kernel_size, 
            stride=1, padding=padding, )
        self.pool2 = nn.MaxPool2d(
            kernel_size=2, stride=2, padding=0, )
        
        self.linear1 = nn.Linear(16 * 4 * 4, 120, bias=True, )
        self.relu1 = nn.ReLU(inplace=True, )
        self.linear2 = nn.Linear(120, 84, bias=True, )
        self.relu2 = nn.ReLU(inplace=True,)
        self.linear3 = nn.Linear(84, 10, )
        self.logsoftmax = nn.LogSoftmax(dim=1, )
    
    def forward(self, images):
        x = self.conv1(images)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        
        x = x.view(-1, 16 * 4 * 4)
        x = self.relu1(self.linear1(x))
        x = self.relu2(self.linear2(x))
        x = self.logsoftmax(self.linear3(x))
        return x

In [87]:
@timer
def train_model_1(
    num_epochs: int, 
    kernel_size: int = 5, 
    padding: int = 0,
    mini_batch_size = MINI_BATCH_SIZE,
    lr=LR,
    criterion=nn.NLLLoss(),
):  
    """
    Function to execute the training of model_1.
    All parameters are fixed, except for the epochs.
    """
    # Inputs fixed for first model
    _model_1 = FashionMnist(kernel_size, padding)
    _model_1.apply(weight_reset)
    _model_1 = _model_1.to(device)
    _model_1.train()
    
    optimizer = optim.SGD(params=_model_1.parameters(), lr=LR, )
    criterion.to(device)
    
    fashion_mnist_train_dataloader = torch.utils.data.DataLoader(
    TRAIN_DATA, batch_size=MINI_BATCH_SIZE, shuffle=True, )

    train_epoch_losses_model_1 = []

    for epoch in range(num_epochs):

        train_mini_batch_losses = []

        for _, (images, labels) in enumerate(fashion_mnist_train_dataloader):
            images = images.to(device)
            labels = labels.to(device)

            output = _model_1(images)
            _model_1.zero_grad()

            loss = criterion(output, labels)
            loss.backward()

            optimizer.step()

            train_mini_batch_losses.append(loss.data.item())

        train_epoch_loss = np.mean(train_mini_batch_losses)
        train_epoch_losses_model_1.append(train_epoch_loss)

        print(f'Epoch {epoch}: {train_epoch_loss:.5}')
    return _model_1, train_epoch_losses_model_1

In [88]:
# Train model 1 and catch the model aswell as the losses
model_1, model_1_losses = train_model_1(num_epochs=10)

Epoch 0: 2.3045
Epoch 1: 2.304
Epoch 2: 2.3035
Epoch 3: 2.303
Epoch 4: 2.3026
Epoch 5: 2.3021
Epoch 6: 2.3016
Epoch 7: 2.3011
Epoch 8: 2.3007
Epoch 9: 2.3002
Elapsed time: 158.1950 seconds.


## Evaluation Model 1

In [89]:
fig = px.line(
    model_1[1],
    title='Model_1 Loss',
    labels={
        'index': 'Training Epoch',
        'value': 'Loss',
        'variable': 'Model',
    },
    template='none',
    log_y=False,
)
fig.show()

"""
Interestingly the loss has a linear scope. I guess the plot is so much 
zoomed in (only 10 epochs) that the slope appears linear. The conclusion
is that more epochs are needed and this model is a true 'vanilla'.
""";

In [90]:
model_1_loss, model_1_acc, model_1_report, model_1_cm, model_1_f1 = evaluate_model(model_1[0])

In [91]:
print(f"Loss: {model_1_loss}")
print(f"Accuracy: {model_1_acc}")
print(f"F1-score: {model_1_f1}")
print("Classiification report:")
print(model_1_report)

"""
The accuracy confirms the previous conclusion the model barely started 
to learn. The accuracy is just above 0.1. The classification report
confirms that aswell. Interestingly the model started to learn one class
first.
""";

Loss: 2.2998476028442383
Accuracy: 0.1005
F1-score: 0.027027049140613258
Classiification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1000
           1       0.00      0.00      0.00      1000
           2       0.12      0.90      0.20      1000
           3       0.05      0.10      0.07      1000
           4       0.00      0.00      0.00      1000
           5       0.00      0.00      0.00      1000
           6       0.00      0.00      0.00      1000
           7       0.00      0.00      0.00      1000
           8       0.00      0.00      0.00      1000
           9       0.00      0.00      0.00      1000

    accuracy                           0.10     10000
   macro avg       0.02      0.10      0.03     10000
weighted avg       0.02      0.10      0.03     10000



In [92]:
model_experiments_matrix(
    model_1_cm, title='Model_1 Experiment', 
    xlabel='True Label', 
    ylabel='Predicted Label',
    colorlabel='Hits', )

"""
The confusion matrix mirrors the classification report. One class is learned
first.
""";

## Re-Train Model 1
Model 1 showed that one class appears to be learned first. To test this hypothesis I retrain the model with more epochs and expect more classes to be learned.

In [93]:
model_1_1, model_1_1_losses = train_model_1(num_epochs=50)

Epoch 0: 2.307
Epoch 1: 2.3065
Epoch 2: 2.3061
Epoch 3: 2.3057
Epoch 4: 2.3053
Epoch 5: 2.3048
Epoch 6: 2.3044
Epoch 7: 2.304
Epoch 8: 2.3036
Epoch 9: 2.3032
Epoch 10: 2.3028
Epoch 11: 2.3024
Epoch 12: 2.3019
Epoch 13: 2.3015
Epoch 14: 2.3011


In [None]:
model_1_1_loss, model_1_1_acc, model_1_1_report, model_1_1_cm, model_1_1_f1 = evaluate_model(model_1_1[0])

## Evaluation Model 1.1

In [None]:
print(f"Loss: {model_1_1_loss}")
print(f"Accuracy: {model_1_1_acc}")
print(f"F1-score: {model_1_1_f1}")
print("Classiification report:")
print(model_1_1_report)

Loss: 2.2787866592407227
Accuracy: 0.1131
F1-score: 0.042484792396233095
Classiification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1000
           1       0.00      0.00      0.00      1000
           2       0.00      0.00      0.00      1000
           3       0.00      0.00      0.00      1000
           4       0.05      0.10      0.07      1000
           5       0.88      0.08      0.14      1000
           6       0.00      0.00      0.00      1000
           7       0.00      0.00      0.00      1000
           8       0.12      0.95      0.22      1000
           9       0.00      0.00      0.00      1000

    accuracy                           0.11     10000
   macro avg       0.10      0.11      0.04     10000
weighted avg       0.10      0.11      0.04     10000



In [None]:
model_experiments_matrix(
    model_1_1_cm, 
    title='Model_1 Experiment [2]', 
    xlabel='True Label', 
    ylabel='Predicted Label', 
    colorlabel='Hits', )

"""
The classification report and the confusion matrix provide evidence
to confirm the hypothesis. The classes seem to be learned 'one-by-one'
""";

## Optimizer Experiment

In [None]:
# Find best optimizer
model = FashionMnist(5, 0)
model = model.to(device)

# Data
fashion_mnist_train_dataloader = torch.utils.data.DataLoader(
    TRAIN_DATA, batch_size=MINI_BATCH_SIZE, shuffle=True, )

optimizers = [
    torch.optim.Adadelta(model.parameters(), lr=LR,),
    torch.optim.Adagrad(model.parameters(), lr=LR,),
    torch.optim.Adam(model.parameters(), lr=LR,),
    torch.optim.RMSprop(model.parameters(), lr=LR,),
    torch.optim.SGD(model.parameters(), lr=LR,),  
]

# Hint from lecture, that nll is better?
criteria = [
    torch.nn.NLLLoss(),
]

In [None]:
"""
Train 5 models with constant inputs except for the optimizer.
The losses are saved after each epoch and model.
""";

num_epochs = 20
mini_batch_size = MINI_BATCH_SIZE

overall_losses = []
train_epoch_losses = []

model.train()

for optimizer in optimizers:
    for criterion in criteria:

        # reset model
        model.apply(weight_reset)

        train_epoch_losses = []
        print(f"Optimizer: {optimizer}\nCriterion: {criterion}\n")
        criterion.to(device)
        #optimizer.to(device)

        for epoch in range(num_epochs):

            train_mini_batch_losses = []
            
            for _, (images, labels) in enumerate(fashion_mnist_train_dataloader):
                images = images.to(device)
                labels = labels.to(device)

                output = model(images)
                model.zero_grad()
                
                loss = criterion(output, labels)
                loss.backward()
                
                optimizer.step()
                
                train_mini_batch_losses.append(loss.data.item())
            
            train_epoch_loss = np.mean(train_mini_batch_losses)
            print('Epoch: {} train-loss: {}'.format(str(epoch), str(train_epoch_loss)))
            
            train_epoch_losses.append(train_epoch_loss)
        overall_losses.append(train_epoch_losses)
        print(overall_losses)
        print("-" * 90)

Optimizer: Adadelta (
Parameter Group 0
    eps: 1e-06
    lr: 0.0001
    rho: 0.9
    weight_decay: 0
)
Criterion: NLLLoss()

Epoch: 0 train-loss: 2.304752920228027
Epoch: 1 train-loss: 2.304639879320222
Epoch: 2 train-loss: 2.3045193213643804
Epoch: 3 train-loss: 2.3044079300691322
Epoch: 4 train-loss: 2.3042914572555118
Epoch: 5 train-loss: 2.304179283093288
Epoch: 6 train-loss: 2.3040664638283412
Epoch: 7 train-loss: 2.3039517316228544
Epoch: 8 train-loss: 2.303842895829093
Epoch: 9 train-loss: 2.3037304319044165
Epoch: 10 train-loss: 2.303625842401468
Epoch: 11 train-loss: 2.303512620519219
Epoch: 12 train-loss: 2.303403151823259
Epoch: 13 train-loss: 2.3032860537327684
Epoch: 14 train-loss: 2.303175117415406
Epoch: 15 train-loss: 2.3030692870174643
Epoch: 16 train-loss: 2.302957877929785
Epoch: 17 train-loss: 2.3028486964545016
Epoch: 18 train-loss: 2.3027388475088677
Epoch: 19 train-loss: 2.302630601915469
[[2.304752920228027, 2.304639879320222, 2.3045193213643804, 2.30440793006

Epoch: 0 train-loss: 2.3021860015926077
Epoch: 1 train-loss: 2.3016213287931007
Epoch: 2 train-loss: 2.3010558980360214
Epoch: 3 train-loss: 2.3004841291065663
Epoch: 4 train-loss: 2.2999145074693885
Epoch: 5 train-loss: 2.2993400813674114
Epoch: 6 train-loss: 2.2987705985111977
Epoch: 7 train-loss: 2.2981988982096917
Epoch: 8 train-loss: 2.297622959751056
Epoch: 9 train-loss: 2.2970440707989592
Epoch: 10 train-loss: 2.296466010465805
Epoch: 11 train-loss: 2.2958798169581365
Epoch: 12 train-loss: 2.2952950071932663
Epoch: 13 train-loss: 2.2947003383880484
Epoch: 14 train-loss: 2.2941039116906206
Epoch: 15 train-loss: 2.2935039956432415
Epoch: 16 train-loss: 2.292888142660991
Epoch: 17 train-loss: 2.292272166148432
Epoch: 18 train-loss: 2.2916409781238416
Epoch: 19 train-loss: 2.2910027326042974
[[2.304752920228027, 2.304639879320222, 2.3045193213643804, 2.3044079300691322, 2.3042914572555118, 2.304179283093288, 2.3040664638283412, 2.3039517316228544, 2.303842895829093, 2.30373043190441

In [None]:
df_optimizers = pd.DataFrame(
    overall_losses, 
    index=[
        'Adadelta-NLL',
        'Adagrad-NLL',
        'Adam-NLL',
        'RMSprop-NLL',
        'SGD-NLL',
    ],
    columns=['Epoch_' + str(epoch) for epoch in list(range(1, 21, 1))],
).T

In [None]:
"""
The experiment results in a pd.df with 5 columns and 20 epochs each.
""";

df_optimizers

Unnamed: 0,Adadelta-NLL,Adagrad-NLL,Adam-NLL,RMSprop-NLL,SGD-NLL
Epoch_1,2.304753,2.262708,1.313551,1.051242,2.302186
Epoch_2,2.30464,2.18089,0.753662,0.727396,2.301621
Epoch_3,2.304519,2.085981,0.675304,0.670579,2.301056
Epoch_4,2.304408,1.986734,0.628897,0.635512,2.300484
Epoch_5,2.304291,1.88692,0.596436,0.608251,2.299915
Epoch_6,2.304179,1.790677,0.571224,0.585324,2.29934
Epoch_7,2.304066,1.701089,0.550911,0.565577,2.298771
Epoch_8,2.303952,1.620501,0.534411,0.548141,2.298199
Epoch_9,2.303843,1.548931,0.51945,0.532702,2.297623
Epoch_10,2.30373,1.486046,0.505083,0.5204,2.297044


In [None]:
fig = px.line(
    df_optimizers,
    title='Optimizer Experiment',
    labels={
        'index': 'Training Epoch',
        'value': '<b>Log</b> Loss',
        'variable': 'Combination',
    },
    template='none',
    log_y=True,
)
fig.show()

"""
The experiment shows, that SGD unfortunately is the worst optimizer
for this data. That explains the linear slope in Task 1.
Adam and RMSprop do a much better job with RMSprop doing a slightly
better job (this changes after each iteration!). 
""";

## Kernel-Size & Epochs Experiment

In [None]:
@timer
def train_model_2(
    num_epochs: int, 
    kernel_size: int, 
    padding: int,
    mini_batch_size = MINI_BATCH_SIZE,
    lr=LR,
    criterion=nn.NLLLoss(),
):
    overall_losses = []
    train_epoch_losses = []
    
    fashion_mnist_train_dataloader = torch.utils.data.DataLoader(TRAIN_DATA, batch_size=mini_batch_size, shuffle=True)
    
    model = FashionMnist(kernel_size, padding)
    model.train()

    optimizer = optim.RMSprop(params=model.parameters(), lr=lr)
    criterion.to(device)

    # Adjust LR
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=3, threshold=0.0001, threshold_mode='abs',)

    for epoch in range(num_epochs):

        train_mini_batch_losses = []

        for _, (images, labels) in enumerate(fashion_mnist_train_dataloader):
            images = images.to(device)
            labels = labels.to(device)
            
            output = model(images)
            model.zero_grad()
            
            loss = criterion(output, labels)
            loss.backward()
            
            # Must be after loss!
            scheduler.step(loss)

            optimizer.step()

            train_mini_batch_losses.append(loss.data.item())

        train_epoch_loss = np.mean(train_mini_batch_losses)
        print(f'Epoch: {epoch} train-loss: {train_epoch_loss}')

        train_epoch_losses.append(train_epoch_loss)
    overall_losses.append(train_epoch_losses)
    print(overall_losses)
    print("-" * 90)
    
    return model, overall_losses

In [None]:
"""
Iterate over the search space (kernel sizes and epochs) and record 
key evaluation metrics after each model. The experiment will result
in 12 models with their metrics.
""";

experiment_losses = []
experiment_model_accs = []
experiment_model_f1s = []

for kernel_size in SEARCH_SPACE.keys():
    for epoch in SEARCH_SPACE[kernel_size]:
        print(kernel_size)
        print(epoch)
        
        # Decide which padding to use
        padding = 0
        if kernel_size == 6:
            padding = 1
        if kernel_size == 8:
            padding = 2
            
        current_model, current_model_losses = train_model_2(epoch, kernel_size, padding)
        _, current_model_acc, _, _, current_model_f1 = evaluate_model(current_model[0])
        
        print(f"Kernel: {kernel_size}x{kernel_size}\n"\
              f"Epoch: {epoch}\n"
              f"Accuracy: {current_model_acc}\n"
              f"F1-score: {current_model_f1}\n\n")
        
        experiment_model_accs.append(round(current_model_acc, 4))
        experiment_model_f1s.append(round(current_model_f1, 4))
        experiment_losses.append(current_model_losses)
        
        print(experiment_model_accs)
        print(experiment_model_f1s)

4
8
Epoch: 0 train-loss: 1.4335088745109055
Epoch: 1 train-loss: 1.330315076466054
Epoch: 2 train-loss: 1.3300310856243696
Epoch: 3 train-loss: 1.3298323375583967
Epoch: 4 train-loss: 1.329515532898242
Epoch: 5 train-loss: 1.3292913106458781
Epoch: 6 train-loss: 1.3290746712735466
Epoch: 7 train-loss: 1.3287921145018229
[[1.4335088745109055, 1.330315076466054, 1.3300310856243696, 1.3298323375583967, 1.329515532898242, 1.3292913106458781, 1.3290746712735466, 1.3287921145018229]]
------------------------------------------------------------------------------------------
Elapsed time: 93.7825 seconds.
Kernel: 4x4
Epoch: 8
Accuracy: 0.631
F1-score: 0.6062822785552365


[0.631]
[0.6063]
4
16
Epoch: 0 train-loss: 1.7227751657144348
Epoch: 1 train-loss: 1.6883789377171856
Epoch: 2 train-loss: 1.6880094374674979
Epoch: 3 train-loss: 1.6876796209481733
Epoch: 4 train-loss: 1.6872972168647913
Epoch: 5 train-loss: 1.6869668752145666
Epoch: 6 train-loss: 1.6866342640126437
Epoch: 7 train-loss: 1.68

Epoch: 1 train-loss: 1.453384769496633
Epoch: 2 train-loss: 1.453031746309195
Epoch: 3 train-loss: 1.4527044695323463
Epoch: 4 train-loss: 1.452350807342448
Epoch: 5 train-loss: 1.4519666115612364
Epoch: 6 train-loss: 1.451645673210941
Epoch: 7 train-loss: 1.4512520151605992
[[1.5339394075784094, 1.453384769496633, 1.453031746309195, 1.4527044695323463, 1.452350807342448, 1.4519666115612364, 1.451645673210941, 1.4512520151605992]]
------------------------------------------------------------------------------------------
Elapsed time: 130.1738 seconds.
Kernel: 6x6
Epoch: 8
Accuracy: 0.5816
F1-score: 0.5266201664962257


[0.631, 0.4491, 0.3148, 0.5873, 0.5816]
[0.6063, 0.369, 0.2021, 0.5254, 0.5266]
6
16
Epoch: 0 train-loss: 1.2739526821352016
Epoch: 1 train-loss: 1.1898243002800037
Epoch: 2 train-loss: 1.1895553168457453
Epoch: 3 train-loss: 1.189162584510185
Epoch: 4 train-loss: 1.18886306367195
Epoch: 5 train-loss: 1.1885134932328898
Epoch: 6 train-loss: 1.188228601840005
Epoch: 7 tra

Epoch: 0 train-loss: 1.3879313654482746
Epoch: 1 train-loss: 1.3336074037084193
Epoch: 2 train-loss: 1.333169617632559
Epoch: 3 train-loss: 1.332697814461519
Epoch: 4 train-loss: 1.3322999614642372
Epoch: 5 train-loss: 1.3319007153195868
Epoch: 6 train-loss: 1.3314541740966503
Epoch: 7 train-loss: 1.3310188011828261
[[1.3879313654482746, 1.3336074037084193, 1.333169617632559, 1.332697814461519, 1.3322999614642372, 1.3319007153195868, 1.3314541740966503, 1.3310188011828261]]
------------------------------------------------------------------------------------------
Elapsed time: 124.1496 seconds.
Kernel: 8x8
Epoch: 8
Accuracy: 0.5985
F1-score: 0.533377082677139


[0.631, 0.4491, 0.3148, 0.5873, 0.5816, 0.66, 0.4613, 0.5785, 0.5985]
[0.6063, 0.369, 0.2021, 0.5254, 0.5266, 0.6315, 0.4086, 0.523, 0.5334]
8
16
Epoch: 0 train-loss: 1.4938130114378452
Epoch: 1 train-loss: 1.444620601403942
Epoch: 2 train-loss: 1.4440846311003923
Epoch: 3 train-loss: 1.4435255901137394
Epoch: 4 train-loss: 1.44

Kernel: 8x8
Epoch: 64
Accuracy: 0.5221
F1-score: 0.4556375509426431


[0.631, 0.4491, 0.3148, 0.5873, 0.5816, 0.66, 0.4613, 0.5785, 0.5985, 0.546, 0.5987, 0.5221]
[0.6063, 0.369, 0.2021, 0.5254, 0.5266, 0.6315, 0.4086, 0.523, 0.5334, 0.4815, 0.5436, 0.4556]


In [None]:
fig = go.Figure(
    data=[
        go.Surface(
            z=np.array(experiment_model_accs).reshape(4,3),
            x=[4, 6, 8, ],
            y=[8, 16, 32, 64,],
        ),
        go.Surface(
            z=np.array(experiment_model_f1s).reshape(4,3),
            x=[4, 6, 8, ],
            y=[8, 16, 32, 64,],
        ),
    ],
)

fig.update_traces(contours_z=dict(
    show=True,
    usecolormap=True,
    highlightcolor="limegreen",
    project_z=True,
))

fig.update_layout(
    title='Accuracy & F1-Score over Kernel Size and no. of Epochs',
    autosize=True,
    width=800,
    height=800,
    margin=dict(
        l=65,
        r=50,
        b=65,
        t=90, ),
)
camera = dict(
    up=dict(x=0, y=0, z=1),  # z-axis up
    center=dict(x=0, y=0, z=0),  # default
    eye=dict(x=1.5, y=1.5, z=0.8)
)

fig.update_layout(
    scene=dict(
        xaxis_title='Kernel size',
        yaxis_title='# Epochs',
        zaxis_title='Test Accuracy',
        xaxis=dict(nticks=3, range=[4, 8],),
        yaxis=dict(nticks=4, range=[8, 64],),
        zaxis=dict(nticks=10, range=[0.1, 0.7],),
    ),
    scene_camera=camera,
)
fig.write_html('./plots/surface_plot.html')
#fig.show()  # comment out if problems with WebGL

< WebGl causes problems with plotting a 3D plot in google colab. >  
The surface plot displays the accuracy and the average f1-scores for each model. We can observe a slight trend upwards with increasing epochs and a dip at kernel size 6x6 across all epochs. I did not know that accuracy and f1-score are that close related, but it makes sense because they assess the same metric (value counts per class) and scale it down between 0 and 1. The upper surface is accuracy and the lower surface are the f1-scores.  
Note that the z-axis is scaled between 0.1 and 0.7.

<img align="center" style="max-width: 600px" src="./plots/surface_plot.png">

In [None]:
fig = px.imshow(
    np.array(experiment_model_accs).reshape(4,3).T,
    title='Model Comparison',
    template='none',
    labels=dict(
        x='# Epochs', 
        y='Kernel Size', 
        color='Accuracy',
    ),
    x=['*8', '*16', '*32', '*64',],
    y=['*4', '*6', '*8', ],
    aspect='equal',
    color_continuous_scale='RdBu',
    zmin=0, 
    zmax=1,
)
fig.show()

"""
The surface plot can be sclaed down to a simple heatmap aswell. 
The color is scaled between 0 and 1. This illustrates that the
models are each closely related. None shows a much much better 
performance.
""";

## Evaluation Model 2

In [None]:
model_2_loss, model_2_acc, model_2_report, model_2_cm, model_2_f1 = evaluate_model(current_model[0])

In [None]:
print(f"Loss: {model_2_loss}")
print(f"Accuracy: {model_2_acc}")
print(f"F1-score: {model_2_f1}")
print("Classiification report:")
print(model_2_report)

Loss: 1.374875545501709
Accuracy: 0.5221
F1-score: 0.4556375509426431
Classiification report:
              precision    recall  f1-score   support

           0       0.58      0.69      0.63      1000
           1       0.62      0.89      0.73      1000
           2       0.42      0.62      0.50      1000
           3       0.53      0.32      0.40      1000
           4       0.41      0.47      0.44      1000
           5       0.48      0.31      0.38      1000
           6       0.12      0.01      0.01      1000
           7       0.95      0.04      0.08      1000
           8       0.54      0.88      0.67      1000
           9       0.56      0.99      0.72      1000

    accuracy                           0.52     10000
   macro avg       0.52      0.52      0.46     10000
weighted avg       0.52      0.52      0.46     10000



In [None]:
fig = px.imshow(
    model_2_cm,
    title='Confusion Matrix',
        labels=dict(
        x="True Label", 
        y="Predicted Label",
        color="Hits",
    ),
    x=[*CLASSES.values()],
    y=[*CLASSES.values()],
    template='none',
    aspect='equal',
    color_continuous_scale='RdBu',
    zmin=0, 
    zmax=1000,
)
fig.show()

"""
The confusion matrix shows an improved 'learning' across classes.
When compared to the vanilla model more classes are predicted correctly.
""";