## Dependencies

In [1]:
import torch
import torchvision
import matplotlib.pyplot as plt

In [2]:
# Select the best device available: CUDA GPU first, then Apple-silicon MPS, then CPU.
# `torch.backends.mps.is_available()` is the long-standing documented check for the MPS backend;
# `torch.mps.is_available()` is missing from older PyTorch releases, which would make this cell
# fail on machines without CUDA.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Load datasets (downloaded to ./data on first use); images are converted to tensors
raw_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())

Using device: cuda
Files already downloaded and verified


## Util Functions

In [None]:
import torch.nn.functional as F
from torch.utils.data import DataLoader
from time import time

### Evaluation

In [None]:
def evaluate(model):
    """
    Computes the top-1 accuracy of `model` on the test split.

    Relies on the notebook globals `device` and `test_dataset`.

    Args:
        model: network mapping a batch of images to one score per class.

    Returns:
        Fraction of test samples (between 0 and 1) whose highest-scoring class equals the label.
    """
    model = model.to(device)
    model.eval()
    correct = 0
    total = 0

    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
    # inference only: without autograd no computation graph is kept for each batch
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # index of the largest score along the class dimension
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

### Training Functions

In [3]:
def train_epoch(
    model: torch.nn.Module,
    train_dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_function,
    device: torch.device,
):
    """
    Runs one optimisation pass over `train_dataloader`.

    For every batch the gradients are reset, the forward pass is evaluated,
    `loss_function(output, target)` is back-propagated and the optimizer takes one step.
    The loss is supplied by the caller, so it can be a plain cross entropy or a more
    elaborate one (e.g. including L2 or EWC regularization).

    Returns the sum of the per-batch loss values divided by the number of batches.
    """

    model = model.to(device)
    model.train()

    running_loss = 0.0
    for inputs, labels in train_dataloader:
        # start every batch from clean gradients
        optimizer.zero_grad()

        # bring the batch onto the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward pass and loss
        batch_loss = loss_function(model(inputs), labels)

        # backward pass followed by the parameter update
        batch_loss.backward()
        optimizer.step()

        # accumulate statistics
        running_loss += batch_loss.item()

    return running_loss / len(train_dataloader)

def validate_epoch(
    model: torch.nn.Module,
    val_dataloader: DataLoader,
    loss_function,
    device: torch.device,
):
    """
    Measures the loss of the model on the validation split.

    The model is switched to evaluation mode and no gradients are recorded.
    Returns the sum of the per-batch loss values divided by the number of batches.
    """

    model = model.to(device)
    model.eval()

    running_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            # bring the batch onto the same device as the model
            inputs, labels = inputs.to(device), labels.to(device)

            # forward pass, loss and statistics in one go
            running_loss += loss_function(model(inputs), labels).item()

    return running_loss / len(val_dataloader)

def fit(
    model: torch.nn.Module,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler.LRScheduler,
    epochs: int,
    device: torch.device,
    loss_function=None,
):
    """
    Trains the model for a specified number of epochs.

    Every epoch calls train_epoch() and then validate_epoch(), steps the learning rate
    scheduler (if one is given) and prints the two losses and the epoch duration.

    Args:
        model: network to train.
        train_dataloader: batches used for the optimisation pass.
        val_dataloader: batches used to compute the validation loss.
        optimizer: optimizer updating the parameters of `model`.
        scheduler: learning rate scheduler stepped once per epoch; may be None.
        epochs: number of epochs to run.
        device: device on which the model and the batches are placed.
        loss_function: callable `(output, target) -> loss` forwarded to train_epoch() and
            validate_epoch(). Defaults to cross entropy when None.

    Returns:
        A tuple `(train_losses, val_losses)` with one entry per epoch.
    """

    # train_epoch() and validate_epoch() both require a loss function; fall back to
    # cross entropy so that existing callers which do not pass one keep working
    if loss_function is None:
        loss_function = torch.nn.functional.cross_entropy

    # keep track of the losses in order to visualize them later
    train_losses = []
    val_losses = []
    for epoch in range(epochs):

        t = time() # current time

        # train function
        train_loss = train_epoch(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            loss_function=loss_function,
            device=device,
        )

        # validate model
        val_loss = validate_epoch(
            model=model,
            val_dataloader=val_dataloader,
            loss_function=loss_function,
            device=device,
        )

        # step scheduler if needed
        if scheduler is not None:
            scheduler.step()

        # append losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        dt = time() - t # time difference
        print(f"Epoch [{epoch + 1}/{epochs}]: train={round(train_loss, 4)} val={round(val_loss, 4)} ({round(dt, 1)}s)")

    return train_losses, val_losses


## Model architecture

In [4]:
class SmallNet(torch.nn.Module):
    """
    Compact CNN: three (conv -> batch-norm -> ReLU -> dropout -> 2x2 max-pool) stages
    followed by a two-layer dense classifier producing `_n_classes` scores.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # layer widths
        self._n_conv1, self._n_conv2, self._n_conv3 = 32, 64, 128
        self._n_ff = 420
        self._n_classes = 100

        # dropout rates
        self._drop_conv = 0.1
        self._drop_dense = 0.2

        # three 2x2 poolings shrink a 32x32 image down to 4x4
        self._flattened_size = self._n_conv3 * 4 * 4

        # stage 1
        self.conv1 = nn.Conv2d(3, self._n_conv1, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(self._n_conv1)
        self.do1 = nn.Dropout(self._drop_conv)

        # stage 2
        self.conv2 = nn.Conv2d(self._n_conv1, self._n_conv2, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(self._n_conv2)
        self.do2 = nn.Dropout(self._drop_conv)

        # stage 3
        self.conv3 = nn.Conv2d(self._n_conv2, self._n_conv3, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(self._n_conv3)
        self.do3 = nn.Dropout(self._drop_conv)

        # dense classifier
        self.fc1 = nn.Linear(self._flattened_size, self._n_ff)
        self.do4 = nn.Dropout(self._drop_dense)
        self.fc2 = nn.Linear(self._n_ff, self._n_classes)

    def _conv_stage(self, x, conv, bn, drop):
        """Applies conv -> batch-norm -> ReLU -> dropout -> 2x2 max-pool (halves height and width)."""
        activated = torch.relu(bn(conv(x)))
        return torch.nn.functional.max_pool2d(drop(activated), 2)

    def forward(self, x):
        """
        Input size is (batch_size, channels, height, width) = (., 3, 32, 32)
        """

        x = self._conv_stage(x, self.conv1, self.bn1, self.do1)  # (n_conv1, 16, 16)
        x = self._conv_stage(x, self.conv2, self.bn2, self.do2)  # (n_conv2, 8, 8)
        x = self._conv_stage(x, self.conv3, self.bn3, self.do3)  # (n_conv3, 4, 4)

        x = x.view(-1, self._flattened_size)                     # (flattened_size)
        x = self.do4(torch.relu(self.fc1(x)))                    # (n_ff)

        return self.fc2(x)                                       # (n_classes)

# total number of scalar weights of a freshly built SmallNet
print("Model parameters: ", sum(map(torch.Tensor.numel, SmallNet().parameters())))

Model parameters:  996376


### Training the model

In [5]:
# Initialize model
model = SmallNet()
model.to(device)

# Loss, optimizer and scheduler
n_epochs = 330

# alternatives that were tried are kept for reference
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

#scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=.9)
# multiply the learning rate by 0.2 every 5 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2)

# split train and validation sets (90% / 10%)
# a seeded generator makes the split identical on every re-run of the notebook
SPLIT_SEED = 0
val_size = int(0.1 * len(raw_dataset))
train_size = len(raw_dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    raw_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# create new dataloaders for training and validation sets
BATCH_SIZE = 128
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(f'Training Samples  : {len(train_dataset)}')
print(f'Validation Samples: {len(val_dataset)}')

print('Training model...')
train_losses, val_losses = fit(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=n_epochs,
    device=device
)

# Plot loss as fct of epoch
print('Plotting curves...')
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title('Training')

print('Done!')

Training Samples  : 45000
Validation Samples: 5000
Training model...
Epoch [1/330]: train=3.9301 val=3.5361 (8.4s)


KeyboardInterrupt: 

### ChadNet

In [None]:
# save the model on a file
torch.save(model.state_dict(), 'chad_net.pt')

# rebuild the architecture the weights were saved from: `model` is a SmallNet, and no class
# named ChadNet is defined in this notebook
loaded_model = SmallNet()
loaded_model.load_state_dict(torch.load('chad_net.pt', weights_only=True))
evaluate(loaded_model)

The model has 996376 parameters
[1m[91mAccuracy on the test set: 50.48%[0m


---

# Discussion

There are two major constraints for this HW, training time constraint (approximately 1 hour) and model size constraint (1M weights). 1M is not much, for reference I found a model online achieving 90% accuracy on the same dataset with 10M weights.

In a nutshell, the timing constraint forces us to have an efficient and compact model structure, otherwise it will take more time to train. We need a compromise in order to maximize final performance. Given the size AND timing constraint, it is not necessarily the largest model which will perform the best.

The following list summarizes the major improvements made to the initial model:
- initial model has 4% test accuracy (100 classes, so random accuracy is 1%)
- increasing batch size -> 35%
- adding a conv layer -> 43%
- more dense weights -> 48%
- step scheduler with 3x epochs -> 52%
- maximum number of epochs -> ??

## Model Architecture

I improved on the base architecture provided mainly by adding one extra convolutional layer, and increasing the neuron count in the first feed-forward dense layer.

### Convolutional Layers

The addition of a third convolutional layer yields an improvement in test accuracy of ~7%. This third convolution layer allows the model to pick up on higher level image patterns, so it is able to detect objects more precisely.

Every convolution layer is followed by a maximum pooling layer which decreases image size by a factor of 4. However, the convolution layer only increases the number of channels by a factor of 2. This means that compared to the base model, my model has 2x fewer floats in its hidden layer in between the conv and dense layers (`_flattened_size`). The number of weights in the first FF layer scales linearly with this hidden layer size (it is `_flattened_size` × `_n_ff`), so adding this third convolution layer permits a fair decrease in parameter count.

### Dense Layers

The previous comment about adding a third conv layer allowed me to increase the number of weights in the first dense layer. It is generally accepted that 2 feed-forward layers at the end of the model is enough for classification, so I chose not to add another dense layer. We can still increase weights in the existing dense layers. Should we?

We should choose the dense weight count according to the rest of the architecture. If the model has a small convolutional part, it's useless to give it a huge dense layer, in the same way that it is foolish to have a very big and complex convolutional part followed by very limited/small dense layers. We try to avoid layers "bottlenecking" other layers.

I performed two 10-epochs trains on two different models:
- a model with 200 neurons in the dense HL (~500K weights)
- a model with 400 neurons in the dense HL (~1M weights)

I found that the largest model performs much better (48% against 42%). Given the model size difference, it's not a huge surprise... This suggests we could have used even more weights in the dense layer, since I obviously chose the dense weight count so that the parameter count falls right under the size constraint.

We will rediscuss this in the "Training Time" section.

### Batch Normalization

Batch normalization normalizes the output of some layer. This helps the model converge faster and combats vanishing gradients. In theory, normalizing layer outputs permits the use of higher learning rates during training as it reduces internal covariate shift.

For convolutional neural layers it is usually recommended to put them in between the convolutional layer and the activation layer (ReLU in our case).

### Dropout Layers
Dropout layers are usually good for regularization in order to avoid overfitting. It looks like the model is overfitting quite fast, since the loss keeps decreasing but the test accuracy does not increase when going from 10 to 30 training epochs.

Test accuracy decreases for positive dropout rates for convolutional layers; it is usually preferred to add dropout layers inside dense feed-forward layers so I just removed the dropouts from the convolutional layers.

A standard value for dropout is 20-50%. I found that a 20% dropout layer added in between the feed-forward layers yields a 2-3% increase in test accuracy. This most likely helps the model avoid overfitting.

We can see on the plotted loss curves that it does reduce overfitting, but it also reduces the test accuracy in the end. Maybe with a longer training time, however, this effect would fade. Unfortunately, I did not take the time to do a 1h long training, so I decided to only keep the dropout layer in the dense layer.

## Training Function & Parameters

### Batch Size

The first parameter which I changed was batch size. Model performance highly depends on batch size, because larger batch sizes produce gradient descents that are more aligned with the "true" gradient. This tends to accelerate convergence. Batch sizes that are way too large tend to overfit training data.

A higher batch size also has a nice side effect of speeding up training. I chose a batch size of 128 as my GPU could handle it (faster training), and as the final model performance didn't suffer from increasing it (initial batch size is 32).

### Optimizer

I tried three different optimizers (Adam, AdamW with weight decay, and SGD). SGD is usually the go-to for classification tasks, and yielded slightly better results than the other two.

### Learning Rate

When using 10 epochs for training, any initial learning rate above $10^{-3}$ yields a significant decrease in test accuracy. Lower values seem to have less influence on final test accuracy. This suggests that $10^{-3}$ is a good initial learning rate, and it can then be successively lowered during training.

I used a **step learning rate scheduler** which multiplies learning rate by a constant $\gamma$ every $N_{steps}$ epochs. It permits to have a high learning rate at the beginning of the training phase and gradually decrease it. 

I found that 20 epochs at constant learning rate of $10^{-3}$ is 3~5% less performant (in test accuracy) than first 10 epochs with a LR of $ 10^{-3} $ and then 10 epochs with a LR of $ 2 \cdot 10^{-4} $. This can be achieved using a step size of 10 epochs and a $\gamma$ value of 20%. Increasing epoch number to 30 using the same scheduler yields again an increase in test accuracy of ~2%.

The value for $\gamma$ should be sufficiently large to be meaningful every step size, otherwise the training would be too long. If it's too low, we are missing out on performance. I settled with $\gamma = 0.2$; changing it to something higher or lower (0.1-0.3) didn't seem to make much of a difference.

### Training Time

Models with more weights tend to have better performance when sufficiently trained, however they are slower to converge. Smaller models tend to converge faster, but might have a lower performance potential. We have a time constraint now, so we need to determine if it's best to "heavily train" a smaller model, or "lightly train" a bigger model.

Going back to the measurements I did with the dense layer size, I found out that both take almost the same amount of time to train. The largest model takes 20% more time to train even though it has 200% the size. It is likely due to my GPU being able to handle the small size of both models, maybe most of the training time is due to non-computation overheads. 20% training time is worth the 6% accuracy gain, so I'm definitely sticking to the 1M model.

### Final Training Time

I did my tests on my own laptop equipped with an RTX 4060 GPU. Every epoch takes approximately 10 seconds taking into account validation; a rule of three allows me to estimate the number of epochs I need to train the final model for, so that it does not exceed the 1 hour limit. **I will train the final model for 330 epochs**.

### Training Function

The training function is largely inspired by the `fit` function provided in TP13. I chose to keep the loss function hardcoded to cross entropy to speed up the training by a factor of ~50%. Cross entropy as a loss function generally performs well in classification tasks.

### Overfitting

It is easy to see that the model quite rapidly overfits. With no scheduler, the resulting test accuracy is very much the same whether you train the model for 10, 20 or 30 epochs, even though the loss keeps decreasing. This suggests that the model is overfitting the training dataset and not generalizing.

For that reason, **I split the train dataset into a train and validation dataset to validate the model at every epoch against unseen data** in order to see how much the model overfits. Ideally, the accuracy against the validation and training dataset should be the same but it's not always the case. I set the validation dataset to 10%. Reducing training dataset size by 10% induced a loss of performance of about 1~2%.

This permits two things:
- Make sure that the model does not overfit. If it does, we can add more dropouts, etc.
- Make sure that the model has learned as much as it could by the end of every step size before decreasing the learning rate (by the factor $\gamma$)

After plotting validation loss, I realized that a step size of 10 for my step learning rate scheduler was a bit too much because the model tends to overfit above 5 epochs at a given learning rate value. I lowered the step size down to 5.

### Data Augmentation

One major problem with deep learning is having a high quality dataset. When the dataset is too small, we can generate pseudo-new data by augmenting data. PyTorch allows us to virtually increase dataset size by applying data augmenting transformations to the images, like:
- random horizontal flips and image crops...
- random contrast/saturation/exposure changes...

It can also help the model overfit the training data less, as it generates never-seen data.

Data augmentation drastically increases training time. Using only 3 transformations (h-flips, cropping and color jittering) increases epoch duration by a factor of >5 (from 10-12 to 60-75 seconds). If there was no time constraint, I would have definitely utilized data augmentation because we definitely expect it to increase final model performance.

Training a model for 30 epochs did not improve on the test accuracy at all, with respect to the scenario where I didn't use data augmentation, so I decided to ignore data augmentation.

## Other Possibilities

Here are some points I have not explored but could be interesting to further improve on the model performance.

### Testing other convolutional/activation layer types
Conv2D is not the only convolutional layer structure that exists; maybe some other conv layer structures would perform better on this classification task. Similarly, maybe other activation functions would yield better performance, for instance a variant of the Rectified-Linear-Unit activation function already used here (e.g. Leaky ReLU).


### Data Normalization

Torchvision datasets accept a `transform` that is applied to every sample before it is used; this can serve both for data augmentation and for data preprocessing. I did not try normalizing the data used for training.

### Transfer Learning

I gave a quick try to transfer learning, using resnet18 finetuned on its two last layers.

As we saw in TP13, a very common approach to deep learning is to not start from scratch but rather finetune an existing pretrained model on a specific dataset. This is because patterns learnt during the (sometimes very long) pretraining phase are usually very relevant to other datasets/use cases. The pretrained model can learn about shapes and objects, and finetuning it using two dense feed-forward layers can teach it how to classify these shapes/objects.

With little effort (transposing TP13 to CIFAR100) I could easily get >25% test accuracy. I eventually settled on training my own model since resnet18 has far too many parameters (~11M) compared to the constraint, but transfer learning is definitely something I could have explored more. You can find the code below.

In [None]:
n_classes = 100

from torchvision.models import resnet18

class FinetuningLastLayer(torch.nn.Module):
    """
    Pretrained resnet18 whose weights are frozen, except for a freshly initialized final
    fully-connected layer with `n_classes` outputs.

    Relies on the notebook globals `n_classes` and `device`.
    """

    def __init__(self):
        super().__init__()
        # load pretrained resnet18; the weights must be requested by keyword, passing the
        # number of classes positionally does not select pretrained weights
        self.network = resnet18(weights="IMAGENET1K_V1")

        # freeze the parameters of the network.
        for param in self.network.parameters():
            param.requires_grad = False

        # replace the last layer with a randomly initialized one and with `n_classes` number of output neurons.
        # (the new layer is created after the freeze, so it stays trainable)
        self.network.fc = torch.nn.Linear(self.network.fc.in_features, n_classes)

        # move the network only once the new head is in place, so every layer ends up on `device`
        self.network.to(device)

    def forward(self, x):
        """Returns the output of the wrapped network for the batch `x`."""
        return self.network(x)

# too many parameters!!!
print("Model parameters: ", sum(p.numel() for p in FinetuningLastLayer().network.parameters()))

Model parameters:  11227812


In [12]:
# save the model on a file
# NOTE(review): as written, `model` is the SmallNet instance created in the training cell, so the
# state dict stored in 'resnet.pt' would not carry FinetuningLastLayer's parameter names and the
# load_state_dict() call below would presumably fail on a key mismatch. The code that trained the
# resnet is not visible in this notebook -- TODO confirm which variable held the trained
# FinetuningLastLayer and save that one instead.
torch.save(model.state_dict(), 'resnet.pt')

# rebuild the architecture, restore the saved weights and evaluate on the test set
loaded_model = FinetuningLastLayer()
loaded_model.load_state_dict(torch.load('resnet.pt', weights_only=True))
evaluate(loaded_model)

The model has 11227812 parameters
[1m[91mAccuracy on the test set: 25.34%[0m
