In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from dataloader import get_loaders
from models import CNNRegressor, CNNClassifier, VGGRegressor

In [2]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), then switches cuDNN to deterministic
    kernels and turns off its autotuner so that repeated GPU runs with the
    same seed pick the same algorithms.

    Args:
        seed: Value used to seed every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Seeding alone does not make cuDNN reproducible: the autotuner may choose
    # different (and non-deterministic) kernels from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
import os

# Folder that receives the trained checkpoints written later in the notebook.
directory = "trained_models"
if os.path.exists(directory):
    # Nothing to do: keep whatever is already stored there.
    print(f"Directory '{directory}' already exists.")
else:
    os.makedirs(directory)
    print(f"Directory '{directory}' created.")

Directory 'trained_models' already exists.


### Regressor Modelling

The task is the following: given an input image, predict the sum of the numbers shown in the image. If there is no guarantee that every test image contains exactly 4 numbers, the typical classifier approach will likely fail, because a classifier can only output the sums it was built for. Instead, we can modify the network to act as a regressor with a single output whose value is rounded to obtain an integer. Hence, during training the model uses MSE loss (the usual choice for regression tasks), and during inference the prediction is rounded to the closest integer to calculate accuracy.

Note that, as a result of this approach, a low MSE loss need not translate into a good accuracy. For example, suppose the model predicts 24.6 while the ground truth is 24: the squared error is small (0.36), but the prediction rounds to 25 and is therefore counted as incorrect. Also note that it is not possible to train the model on the rounded outputs directly, because rounding is a step function whose gradient is zero almost everywhere, which makes it useless for learning.

In [4]:
def train_model(train_loader, val_loader, model, criterion, optimizer, epochs, device, type="regressor"):
    """Train ``model`` and print the train loss and validation accuracy after every epoch.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches used for optimisation.
        val_loader: Iterable of ``(images, labels)`` batches used for evaluation only.
        model: The ``torch.nn.Module`` to train; its parameters are updated in place.
        criterion: Loss, called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that already holds the parameters of ``model``.
        epochs: Number of full passes over ``train_loader``.
        device: Device every batch is moved to before the forward pass.
        type: ``"classifier"`` treats the outputs as per-class scores (labels are
            cast to ``long`` and predictions are taken with ``argmax``); any
            other value treats them as one regression value per sample, rounded
            to the nearest integer for the accuracy. The name shadows the
            builtin but is kept so existing keyword calls continue to work.

    Returns:
        None. One summary line is printed per epoch. The reported validation
        accuracy is the percentage of correctly predicted samples over the whole
        validation set (not the mean of per-batch percentages, which is biased
        when the last batch is smaller).
    """
    is_classifier = type == "classifier"

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        train_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            if is_classifier:
                labels = labels.to(torch.long)
                # Keep the batch dimension explicit: a bare squeeze() would also
                # drop it for a final batch of one sample and break the loss.
                outputs = outputs.reshape(len(labels), -1)
            else:
                # Give the predictions exactly the label shape so the loss never
                # broadcasts (N, 1) against (N,) or collapses a batch of one.
                outputs = outputs.reshape(labels.shape)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1

        correct = 0
        seen = 0
        model.eval()
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                if is_classifier:
                    # argmax over raw scores equals argmax over softmax, so the
                    # normalisation step is unnecessary here.
                    preds = outputs.reshape(len(labels), -1).argmax(dim=1)
                else:
                    preds = torch.round(outputs.reshape(labels.shape))
                correct += (preds == labels).sum().item()
                seen += len(labels)

        # Guard against empty loaders instead of dividing by zero.
        mean_loss = train_loss / train_batches if train_batches else float("nan")
        val_acc = 100.0 * correct / seen if seen else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {mean_loss:.4f}, Val Accuracy: {val_acc:.4f}")

In [5]:
# Hyperparameters and run configuration shared by every experiment below.
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is visible
train_fraction = 0.8  # forwarded to get_loaders; presumably the share of data used for training — confirm in dataloader.py
batch_size = 64  # forwarded to get_loaders
seed = 42  # seeds the RNGs and is embedded in the checkpoint file names
epochs = 15  # number of passes over the training loader
learning_rate = 1e-3  # Adam learning rate used for all three models
seed_everything(seed)  # seed the RNGs before any model is constructed

In [6]:
# Build the CNN regressor and optimise it with MSE loss and Adam.
model = CNNRegressor().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loaders are built after the model; keep this order, since both steps may
# draw from the seeded RNGs (NOTE(review): confirm how get_loaders uses `seed`).
train_loader, val_loader = get_loaders(train_fraction, batch_size, seed)
# Default type="regressor": predictions are rounded before accuracy is computed.
train_model(train_loader, val_loader, model, criterion, optimizer, epochs, device)

Epoch 1/15, Train Loss: 34.8954, Val Accuracy: 7.8624
Epoch 2/15, Train Loss: 21.6476, Val Accuracy: 9.5246
Epoch 3/15, Train Loss: 15.8796, Val Accuracy: 10.4167
Epoch 4/15, Train Loss: 12.8713, Val Accuracy: 11.4029
Epoch 5/15, Train Loss: 11.0107, Val Accuracy: 11.9847
Epoch 6/15, Train Loss: 9.5900, Val Accuracy: 11.7797
Epoch 7/15, Train Loss: 8.7342, Val Accuracy: 9.1977
Epoch 8/15, Train Loss: 8.6009, Val Accuracy: 12.7992
Epoch 9/15, Train Loss: 7.5465, Val Accuracy: 11.7298
Epoch 10/15, Train Loss: 7.2628, Val Accuracy: 11.5747
Epoch 11/15, Train Loss: 6.6896, Val Accuracy: 10.0898
Epoch 12/15, Train Loss: 5.4973, Val Accuracy: 12.2230
Epoch 13/15, Train Loss: 6.7779, Val Accuracy: 12.1509
Epoch 14/15, Train Loss: 6.1146, Val Accuracy: 11.8850
Epoch 15/15, Train Loss: 4.6103, Val Accuracy: 11.7686


In [7]:
## Calculating overall accuracy of the regressor on the validation and training splits
model.eval()
for split_name, loader in (("Validation", val_loader), ("Training", train_loader)):
    total_len = 0
    total_correct = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            # Align the predictions with the label shape so an (N, 1) output is
            # never broadcast against (N,) labels into an N x N comparison.
            outputs = model(x).reshape(y.shape)
            total_correct += (torch.round(outputs) == y).sum().item()
            total_len += len(y)
    # An empty split reports nan instead of raising ZeroDivisionError.
    accuracy = (total_correct / total_len) * 100 if total_len else float("nan")
    print(f"Overall Accuracy of Regressor on {split_name} Split: {accuracy:.2f}%")

Overall Accuracy of Regressor on Validation Split: 11.77%
Overall Accuracy of Regressor on Training Split: 22.30%


In [8]:
# Persist the CNN regressor's weights; the seed in the file name keeps runs apart.
checkpoint_path = f"./trained_models/cnnregressor_{seed}.pt"
torch.save(model.state_dict(), checkpoint_path)

### Classifier Modelling

If we instead assume that the input image always contains exactly 4 numbers (the only case that appears in the dataset), we can model the network as a classifier whose outputs range from 0 to 36. In this case we can use the cross-entropy loss, which is commonly used for classification tasks. Empirically, we observe that the performance on the validation split is lower for the classifier than for the regressor. This might be because regression operates in a continuous space (just like the input), which possibly allows for a smoother optimization. However, it was not possible to verify why the regressor works better than the classifier.

In [9]:
# Build the CNN classifier and train it with cross-entropy loss, using the same
# Adam learning rate as the regressor run. This rebinds `model`, `criterion`
# and `optimizer` from the previous experiment.
model = CNNClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Loaders are recreated with the same fraction, batch size and seed as before.
train_loader, val_loader = get_loaders(train_fraction, batch_size, seed)
# type="classifier" makes train_model cast labels to long and score with argmax.
train_model(train_loader, val_loader, model, criterion, optimizer, epochs, device, type="classifier")

Epoch 1/15, Train Loss: 3.1989, Val Accuracy: 6.6766
Epoch 2/15, Train Loss: 3.0966, Val Accuracy: 7.7238
Epoch 3/15, Train Loss: 2.9013, Val Accuracy: 8.5051
Epoch 4/15, Train Loss: 2.7367, Val Accuracy: 9.0148
Epoch 5/15, Train Loss: 2.5567, Val Accuracy: 8.8431
Epoch 6/15, Train Loss: 2.3252, Val Accuracy: 8.3112
Epoch 7/15, Train Loss: 2.0238, Val Accuracy: 8.1727
Epoch 8/15, Train Loss: 1.7025, Val Accuracy: 7.7903
Epoch 9/15, Train Loss: 1.3624, Val Accuracy: 7.1199
Epoch 10/15, Train Loss: 1.0731, Val Accuracy: 7.9898
Epoch 11/15, Train Loss: 0.8112, Val Accuracy: 7.7349
Epoch 12/15, Train Loss: 0.6251, Val Accuracy: 7.5078
Epoch 13/15, Train Loss: 0.5074, Val Accuracy: 7.1144
Epoch 14/15, Train Loss: 0.4010, Val Accuracy: 7.3415
Epoch 15/15, Train Loss: 0.3226, Val Accuracy: 7.5964


In [10]:
## Calculating overall accuracy of the classifier on the validation and training splits
model.eval()
for split_name, loader in (("Validation", val_loader), ("Training", train_loader)):
    total_len = 0
    total_correct = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            # argmax over the raw scores equals argmax over softmax, and the
            # resulting class indices are already integers, so neither softmax
            # nor rounding is needed.
            outputs = model(x).argmax(dim=1)
            total_correct += (outputs == y).sum().item()
            total_len += len(y)
    # An empty split reports nan instead of raising ZeroDivisionError.
    accuracy = (total_correct / total_len) * 100 if total_len else float("nan")
    print(f"Overall Accuracy of Classifer on {split_name} Split: {accuracy:.2f}%")

Overall Accuracy of Classifer on Validation Split: 7.60%
Overall Accuracy of Classifer on Training Split: 66.07%


In [11]:
# Persist the CNN classifier's weights; the seed in the file name keeps runs apart.
checkpoint_path = f"./trained_models/cnnclassifier_{seed}.pt"
torch.save(model.state_dict(), checkpoint_path)

The same behaviour was observed across seeds (0, 10, 42), indicating that it is consistent. Also note that early stopping on the validation split has not been implemented for the models (even though the validation accuracy is higher at some earlier epochs), so that the training loss becomes sufficiently low. Training for further epochs leads to a decrease in the validation accuracy, indicating overfitting; hence, the number of epochs was chosen to be 15. Note that the training accuracy is very high for the classifier approach while the validation accuracy is not, which suggests that this approach is overfitting.

### Transfer Learning With VGG

Owing to the relative success of the regressor approach, a third type of network was created by transfer learning from VGG (a popular CNN architecture which achieves fairly high accuracy across various tasks). The convolutional backbone and the average pooling layer of the network are frozen, and a new head with a single output is added on top, as in the previous CNN regressor. Note that a convolutional layer is also prepended so that the single-channel MNIST images are compatible with the VGG network, which was trained on 3-channel RGB images. This approach is computationally more expensive, but was tried out to see whether the pretraining of VGG leads to better results.

In [None]:
# Build the VGG-based regressor and train it with the same loss, optimizer
# settings and epoch budget as the CNN regressor.
model = VGGRegressor().to(device)
criterion = nn.MSELoss()
# NOTE(review): model.parameters() also yields any frozen backbone weights;
# presumably they have requires_grad=False and are skipped by Adam — confirm in models.py.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Loaders are recreated with the same fraction, batch size and seed as before.
train_loader, val_loader = get_loaders(train_fraction, batch_size, seed)
train_model(train_loader, val_loader, model, criterion, optimizer, epochs, device)

Epoch 1/15, Train Loss: 37.5322, Val Accuracy: 7.9344
Epoch 2/15, Train Loss: 28.4345, Val Accuracy: 8.3610
Epoch 3/15, Train Loss: 27.7135, Val Accuracy: 8.6104
Epoch 4/15, Train Loss: 26.6609, Val Accuracy: 8.8597
Epoch 5/15, Train Loss: 26.2694, Val Accuracy: 8.3998
Epoch 6/15, Train Loss: 25.7796, Val Accuracy: 8.8375
Epoch 7/15, Train Loss: 25.2354, Val Accuracy: 9.4415
Epoch 8/15, Train Loss: 25.5707, Val Accuracy: 9.2974
Epoch 9/15, Train Loss: 24.5139, Val Accuracy: 9.1201
Epoch 10/15, Train Loss: 24.7005, Val Accuracy: 9.1534
Epoch 11/15, Train Loss: 24.3212, Val Accuracy: 9.2365


In [8]:
## Calculating overall accuracy of the VGG regressor on the validation and training splits
model.eval()
for split_name, loader in (("Validation", val_loader), ("Training", train_loader)):
    total_len = 0
    total_correct = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            # Align the predictions with the label shape so an (N, 1) output is
            # never broadcast against (N,) labels into an N x N comparison.
            outputs = model(x).reshape(y.shape)
            total_correct += (torch.round(outputs) == y).sum().item()
            total_len += len(y)
    # An empty split reports nan instead of raising ZeroDivisionError.
    accuracy = (total_correct / total_len) * 100 if total_len else float("nan")
    print(f"Overall Accuracy of Regressor on {split_name} Split: {accuracy:.2f}%")

Overall Accuracy of Regressor on Validation Split: 7.83%
Overall Accuracy of Regressor on Training Split: 8.67%


In [23]:
# Persist the VGG regressor's weights; the seed in the file name keeps runs apart.
checkpoint_path = f"./trained_models/vggregressor_{seed}.pt"
torch.save(model.state_dict(), checkpoint_path)

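### Results Across Seeds

Note: the cell outputs shown above do not exactly match the tables below (for example, the VGG accuracies printed above coincide with the seed 10 row rather than the seed 42 row), so the tables were presumably compiled from separate runs.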

#### Validation Accuracy

| Seed | CNN Regressor | CNN Classifier | VGG Regressor |
|------|---------------|----------------|---------------|
| 0    | 11.42%        | 6.70%          | 10.23%        |
| 10   | 10.92%        | 8.00%          | 7.83%         |
| 42   | 10.65%        | 7.70%          | 9.12%         |

#### Training Accuracy

| Seed | CNN Regressor | CNN Classifier | VGG Regressor |
|------|---------------|----------------|---------------|
| 0    | 21.90%        | 52.36%         | 9.98%         |
| 10   | 22.22%        | 76.92%         | 8.67%         |
| 42   | 18.16%        | 68.08%         | 9.29%         |