This script can be used to compare the different model architecture through the confusion matrix

### Import modules

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchmetrics.classification import ConfusionMatrix

# Put a defined seed to be able to have "deterministic" results
torch.manual_seed(0)

<torch._C.Generator at 0x7febaff2e590>

### Import and process data

In [None]:
dataset_full = torchvision.datasets.FashionMNIST(
    "data", train=True, download=True, transform=transforms.ToTensor()
)

# Choosing the classes we want
idx_tshirt_trouser = [0, 1]
idx_pullover_dress = [2, 3]
idx_tshirt_trouser_pullover_dress = [0, 1, 2, 3]

# Creating the datasets
dataset_tshirt_trouser = [
    data for data in dataset_full if data[1] in idx_tshirt_trouser
]  # 01
dataset_pullover_dress = [
    data for data in dataset_full if data[1] in idx_pullover_dress
]  # 23
dataset_tshirt_trouser_pullover_dress = [
    data for data in dataset_full if data[1] in idx_tshirt_trouser_pullover_dress
]

# Splits the datasets between train and test
train_dataset_all, test_dataset_all = torch.utils.data.dataset.random_split(
    dataset_full, [50000, 10000]
)
train_dataset_01, test_dataset_01 = torch.utils.data.dataset.random_split(
    dataset_tshirt_trouser, [10000, 2000]
)
train_dataset_23, test_dataset_23 = torch.utils.data.dataset.random_split(
    dataset_pullover_dress, [10000, 2000]
)
train_dataset_0123, test_dataset_0123 = torch.utils.data.dataset.random_split(
    dataset_tshirt_trouser_pullover_dress, [20000, 4000]
)

# Creating PyTorch DataLoader with a batch_size of 8
batch_size = 8
train_loader_all = DataLoader(train_dataset_all, batch_size=batch_size)
test_loader_all = DataLoader(test_dataset_all, batch_size=batch_size)

train_loader_01 = DataLoader(train_dataset_01, batch_size=batch_size)
test_loader_01 = DataLoader(test_dataset_01, batch_size=batch_size)

train_loader_23 = DataLoader(train_dataset_23, batch_size=batch_size)
test_loader_23 = DataLoader(test_dataset_23, batch_size=batch_size)

train_loader_0123 = DataLoader(train_dataset_0123, batch_size=batch_size)
test_loader_0123 = DataLoader(test_dataset_0123, batch_size=batch_size)

### Architecture of the model

In [None]:
# Simple class that creates a CNN network with 
# two convolutional layers and a FCN

class FashionCNN(nn.Module):
    def __init__(self):
        super(FashionCNN, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.fc1 = nn.Linear(in_features=64 * 6 * 6, out_features=600)
        self.fc2 = nn.Linear(in_features=600, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=4)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)

        return out

### Compute sum of confusion matrix for the two training methods

In [None]:
versions = ["01_0123", "23_0123"] # from the generic name of the models, to import them

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
avg_confusion = []
n_models = 50
for version in versions:
    all_y_pred = []
    all_y_true = []

    for i in range(n_models):
        path = f"models/model_{version}_ex_{i}.pth"
        model_dict = torch.load(path)
        model = FashionCNN()
        model.to(device)
        model.load_state_dict(model_dict["model_state_dict"])
        model.eval()

        y_pred = []
        y_true = []

        for inputs, labels in test_loader_0123:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)

            output = (torch.max(torch.exp(output), 1)[1]).data.cpu().numpy()
            y_pred.extend(output)
            labels = labels.data.cpu().numpy()
            y_true.extend(labels)

        all_y_pred.extend(y_pred)
        all_y_true.extend(y_true)

    classes = ("T-shirt/top", "Trouser", "Pullover", "Dress")
    metric = ConfusionMatrix(task="multiclass", num_classes=4)
    metric.update(torch.tensor(all_y_pred), torch.tensor(all_y_true))
    plt.figure()
    metric.plot()

    conf_matrix_tensor = metric.compute().detach().cpu()
    conf_matrix_numpy = conf_matrix_tensor.numpy()
    avg_confusion.append(conf_matrix_numpy / n_models)

plt.show()

### Compare them to new models

Now we calculate the distance between new models and those average confusion matrix

In [None]:
versions = ["01_0123_test", "23_0123_test"] # again, the generic name

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
n_models = 50
results = []
results_off_diag = []
results_diag = []
for version in versions:
    result = []
    result_off_diag = []
    result_diag = []
    for i in range(n_models):
        path = f"models/model_{version}_ex_{i}.pth"
        model_dict = torch.load(path)
        model = FashionCNN()
        model.to(device)
        model.load_state_dict(model_dict["model_state_dict"])
        model.eval()

        y_pred = []
        y_true = []

        for inputs, labels in test_loader_0123:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)

            output = (torch.max(torch.exp(output), 1)[1]).data.cpu().numpy()
            y_pred.extend(output)

            labels = labels.data.cpu().numpy()
            y_true.extend(labels)

        classes = ("T-shirt/top", "Trouser", "Pullover", "Dress")
        metric = ConfusionMatrix(task="multiclass", num_classes=4)
        metric.update(torch.tensor(y_pred), torch.tensor(y_true))
        conf_matrix_tensor = metric.compute().detach().cpu()
        conf_matrix_numpy = conf_matrix_tensor.numpy()

        distance_AB = np.linalg.norm(conf_matrix_numpy - avg_confusion[0])
        distance_AC = np.linalg.norm(conf_matrix_numpy - avg_confusion[1])

        A_off_diag = conf_matrix_numpy - np.diag(np.diag(conf_matrix_numpy))
        B_off_diag = avg_confusion[0] - np.diag(np.diag(avg_confusion[0]))
        C_off_diag = avg_confusion[1] - np.diag(np.diag(avg_confusion[1]))

        distance_AB_off_diag = np.linalg.norm(A_off_diag - B_off_diag)
        distance_AC_off_diag = np.linalg.norm(A_off_diag - C_off_diag)

        distance_AB_diag = np.linalg.norm(conf_matrix_numpy - avg_confusion[0])
        distance_AC_diag = np.linalg.norm(conf_matrix_numpy - avg_confusion[1])

        if distance_AB < distance_AC:
            result.append(0)
        else:
            result.append(1)

        if distance_AB_off_diag < distance_AC_off_diag:
            result_off_diag.append(0)
        else:
            result_off_diag.append(1)

        if distance_AB_diag < distance_AC_diag:
            result_diag.append(0)
        else:
            result_diag.append(1)

    results.append(result)
    results_off_diag.append(result_off_diag)
    results_diag.append(result_diag)

### Print results

In [None]:
number_correct_0 = (np.array(results[0]) == 0).sum()
number_correct_1 = (np.array(results[1]) == 1).sum()

print(
    f"There are {number_correct_0} correct predictions for version {versions[0]} out of {len(results[0])} tests, {number_correct_0*100/len(results[0]):.2f}% success"
)
print(
    f"There are {number_correct_1} correct predictions for version {versions[1]} out of {len(results[1])} tests, {number_correct_1*100/len(results[1]):.2f}% success"
)

There are 37 correct predictions for version 01_0123 out of 50 tests, 74.00% success
There are 46 correct predictions for version 23_0123 out of 50 tests, 92.00% success


In [None]:
number_correct_0 = (np.array(results_off_diag[0]) == 0).sum()
number_correct_1 = (np.array(results_off_diag[1]) == 1).sum()

print(
    f"There are {number_correct_0} correct predictions for version {versions[0]} out of {len(results[0])} tests, {number_correct_0*100/len(results[0]):.2f}% success"
)
print(
    f"There are {number_correct_1} correct predictions for version {versions[1]} out of {len(results[1])} tests, {number_correct_1*100/len(results[1]):.2f}% success"
)

There are 42 correct predictions for version 01_0123 out of 50 tests, 84.00% success
There are 45 correct predictions for version 23_0123 out of 50 tests, 90.00% success


In [None]:
number_correct_0 = (np.array(results_diag[0]) == 0).sum()
number_correct_1 = (np.array(results_diag[1]) == 1).sum()

print(
    f"There are {number_correct_0} correct predictions for version {versions[0]} out of {len(results[0])} tests, {number_correct_0*100/len(results[0]):.2f}% success"
)
print(
    f"There are {number_correct_1} correct predictions for version {versions[1]} out of {len(results[1])} tests, {number_correct_1*100/len(results[1]):.2f}% success"
)

There are 37 correct predictions for version 01_0123 out of 50 tests, 74.00% success
There are 46 correct predictions for version 23_0123 out of 50 tests, 92.00% success
