In [2]:
import datetime
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import models

from data import getDataLoaders

In [13]:
# Load the ResNet-18 model from PyTorch and display its architecture.
# `is_available` has to be *called*: the bare function object is always truthy,
# so without the parentheses `device` would be "cuda" even on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet18 = models.resnet18(weights='DEFAULT').to(device)
resnet18

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Two parts of this pre-trained ResNet need to be adjusted: the first layer and the last layer. ResNet was originally trained on the ImageNet dataset, whose images are fed to the network at 224x224 pixels and are classified into 1,000 categories. Here are the two main issues and their solutions:

1) **Input image size and normalization**: Pre-trained models expect input images to be normalized in a specific way, with mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are at least 224 pixels (cf [PyTorch ResNet Documentation](https://pytorch.org/hub/pytorch_vision_resnet/)). Moreover, the model expects the input images to be normalized with the following mean and standard deviation values (calculated from ImageNet data):
    - `mean = [0.485, 0.456, 0.406]`
    - `std = [0.229, 0.224, 0.225]`
   
***Solution***: Since CIFAR-10 images are 32x32 pixels, we can either resize them to match the pre-trained model's input size or adapt the first layer to fit the smaller input. Resizing to 224x224 can cause distortion and interpolation artifacts without adding any real detail, and it significantly increases computation time because of the larger input size. Instead, we replace the first layer's 7x7, stride-2 convolution with a 3x3, stride-1 convolution, which is more efficient: it preserves the detail of the small images and costs less to compute than the original 7x7 kernel. We also omit the maxpool layer, since its additional downsampling is unnecessary for such small images. Note that the replacement first layer is newly initialised, so it does not inherit any pre-trained weights.

2) **Output layer**: The final fully connected layer, `(fc): Linear(in_features=512, out_features=1000, bias=True)`, is designed to output 1,000 features, corresponding to the 1,000 classes of ImageNet. 

***Solution***: The CIFAR-10 dataset has 10 different classes. Therefore, we need to adjust the `out_features` parameter in the final fully connected layer: `(fc): Linear(in_features=512, out_features=10, bias=True).`

In [73]:
class ResNet(nn.Module):
    """ResNet-18 adapted to small inputs such as 32x32 CIFAR-10 images.

    Three parts of the torchvision model are replaced:
      * ``conv1``: the 7x7 / stride-2 stem becomes a 3x3 / stride-1 convolution,
      * ``maxpool``: replaced by ``nn.Identity`` so no pooling happens after the stem,
      * ``fc``: a new linear head with ``num_classes`` outputs.

    The replacement ``conv1`` and ``fc`` layers are freshly created here, so they
    are randomly initialised even when ``weights`` is given.

    Args:
        weights: Passed through to ``torchvision.models.resnet18``.
        num_classes: Number of outputs of the classification head (default 10).
    """

    def __init__(self, weights=None, num_classes=10):
        super().__init__()

        self.model = models.resnet18(weights=weights)
        self.model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.model.maxpool = nn.Identity()
        # Take the head's input width from the layer being replaced rather than
        # hard-coding 512.
        self.model.fc = nn.Linear(
            in_features=self.model.fc.in_features,
            out_features=num_classes,
            bias=True,
        )

    def forward(self, x):
        """Return the class logits produced by the wrapped model for batch ``x``."""
        return self.model(x)

Initially, we will test the model with default weights. Consequently, we should use the provided normalization values since the model was originally trained on a different dataset, and these values help ensure the input is consistent with what the model expects.

In [85]:
# Adapted ResNet built on the torchvision 'DEFAULT' weights.
default_net = ResNet(weights='DEFAULT')

# Per-channel normalization statistics quoted above for the pre-trained model.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Build the train/val loaders with that normalization; the last line displays
# the training dataset (including its transform pipeline) as the cell output.
train_loader, val_loader = getDataLoaders(mean=mean, std=std)
train_loader.dataset

Files already downloaded and verified
Files already downloaded and verified


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data/cifar-10-batches-py/
    Split: Train
    StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False)
           )

In [3]:
def model_accuracy(model, train_loader, val_loader, device, num_classes=10):
    """Print accuracy and the predicted-class distribution on the train and val loaders.

    The model is moved to ``device`` and switched to eval mode (it is left in
    eval mode afterwards). For each loader the function prints the number of
    correct predictions, the accuracy in percent, and the share of predictions
    that went to each class.

    Args:
        model: Module mapping an image batch to logits of shape (batch, classes).
        train_loader: Iterable of ``(imgs, labels)`` batches for the training split.
        val_loader: Iterable of ``(imgs, labels)`` batches for the validation split.
        device: Device on which the evaluation runs.
        num_classes: Number of classes in the distribution (default 10).

    Returns:
        None. Results are printed.
    """
    model = model.to(device=device)
    model.eval()

    # A single inference_mode context covers the whole evaluation.
    with torch.inference_mode():
        for name, loader in (('train', train_loader), ('val', val_loader)):
            correct = 0
            # Count the samples actually iterated: len(loader.dataset) is wrong
            # whenever the loader drops the last batch or samples a subset.
            total = 0
            count = torch.zeros(num_classes, dtype=torch.long, device=device)
            for imgs, labels in loader:
                imgs = imgs.to(device=device)
                labels = labels.to(device=device)

                pred = torch.argmax(model(imgs), dim=1)
                correct += int((pred == labels).sum())
                count += torch.bincount(pred, minlength=num_classes)
                total += labels.size(0)

            # An empty loader would otherwise divide by zero; it reports 0.00%.
            denom = max(total, 1)
            shares = ', '.join(f'{(c / denom * 100):.2f}' for c in count.tolist())
            print(f"Score {name}: {correct} / {total}",
                  f"\nAccuracy {name}: {(correct / denom)*100:.2f}%",
                  f"\nDistribution {name} (in %): [{shares}]")

            print()

In [86]:
# Baseline check: evaluate the untrained, adapted network on both splits.
model_accuracy(
    model=default_net,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device
)

Score train: 4973 / 50000 
Accuracy train: 9.95% 
Distribution train (in %): [0.00, 0.00, 2.74, 0.00, 0.44, 0.03, 85.88, 0.34, 0.00, 10.57]

Score val: 1015 / 10000 
Accuracy val: 10.15% 
Distribution val (in %): [0.00, 0.00, 2.73, 0.00, 0.46, 0.01, 85.37, 0.62, 0.00, 10.81]



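As expected, the model needs to be trained properly to produce meaningful results on the CIFAR-10 dataset. Applying basic transformations alone isn’t enough: the replaced first convolution and the new 10-class output layer are randomly initialised, so the pre-trained layers in between receive and produce features they were never trained on. The current accuracy of about 10% shows that the model does no better than chance, which is exactly what picking one of ten classes at random would give.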

More interestingly, the class distribution reveals a clear bias. The model is heavily favoring certain classes while barely predicting others.

Now, let's train the model on the CIFAR-10 dataset.

In [6]:
def training_loop(n_epochs, loader, model, optimizer, loss_fn, device):
    """Train ``model`` for ``n_epochs`` passes over ``loader``, printing stats per epoch.

    Each batch goes through one optimizer step. After every epoch a line with
    the current timestamp, the sample-weighted mean loss and the training
    accuracy (in percent) is printed.

    Args:
        n_epochs: Number of passes over ``loader``.
        loader: Iterable of ``(imgs, labels)`` batches.
        model: Module to train; it is moved to ``device``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``(outputs, labels) -> scalar loss tensor``.
        device: Device on which the batches and the model are placed.
    """
    model = model.to(device)
    for epoch in range(1, n_epochs + 1):
        model.train()
        running_loss, n_correct, n_seen = 0.0, 0, 0

        for batch_imgs, batch_labels in loader:
            batch_imgs = batch_imgs.to(device)
            batch_labels = batch_labels.to(device)

            # Standard step: clear old gradients, forward, backward, update.
            optimizer.zero_grad()
            logits = model(batch_imgs)
            batch_loss = loss_fn(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so that the epoch average
            # is a per-sample mean.
            running_loss += batch_loss.item() * batch_imgs.size(0)
            predicted = logits.max(dim=1).indices
            n_correct += (predicted == batch_labels).sum().item()
            n_seen += batch_labels.size(0)

        avg_loss = running_loss / n_seen
        accuracy = n_correct / n_seen * 100

        print(f"{datetime.datetime.now()}, Epoch: {epoch}, Train Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

In [11]:
# CIFAR-10 normalization: rebuild the loaders with getDataLoaders' default
# statistics (see the Normalize entry in the transform displayed below).
train_loader, val_loader = getDataLoaders()
train_loader.dataset

Files already downloaded and verified
Files already downloaded and verified


Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./data/cifar-10-batches-py/
    Split: Train
    StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[tensor(0.4914), tensor(0.4822), tensor(0.4465)], std=[tensor(0.2470), tensor(0.2435), tensor(0.2616)], inplace=False)
           )

In [103]:
# Train the adapted ResNet from scratch (no pre-trained weights) for 10 epochs
# using cross-entropy loss and SGD with momentum.
cifar_net = ResNet()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(cifar_net.parameters(), lr=0.01, momentum=0.9)
training_loop(
    n_epochs=10,
    loader=train_loader,
    model=cifar_net, 
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device
)

2024-09-17 20:13:17.859327, Epoch: 1, Train Loss: 1.3734, Accuracy: 50.46%
2024-09-17 20:14:14.363153, Epoch: 2, Train Loss: 0.8163, Accuracy: 71.27%
2024-09-17 20:15:11.154941, Epoch: 3, Train Loss: 0.5688, Accuracy: 79.89%
2024-09-17 20:16:07.971020, Epoch: 4, Train Loss: 0.3904, Accuracy: 86.29%
2024-09-17 20:17:04.778036, Epoch: 5, Train Loss: 0.2539, Accuracy: 91.13%
2024-09-17 20:18:01.656273, Epoch: 6, Train Loss: 0.1548, Accuracy: 94.46%
2024-09-17 20:18:58.527789, Epoch: 7, Train Loss: 0.1277, Accuracy: 95.55%
2024-09-17 20:19:55.473683, Epoch: 8, Train Loss: 0.0717, Accuracy: 97.47%
2024-09-17 20:20:52.591468, Epoch: 9, Train Loss: 0.0551, Accuracy: 98.19%
2024-09-17 20:21:49.774665, Epoch: 10, Train Loss: 0.0504, Accuracy: 98.27%


In [104]:
# Evaluate the trained network on both splits to compare train vs. val accuracy.
model_accuracy(
    model=cifar_net,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device
)

Score train: 49083 / 50000 
Accuracy train: 98.17% 
Distribution train (in %): [10.24, 9.82, 10.09, 10.58, 10.02, 9.19, 10.05, 9.88, 9.97, 10.16]

Score val: 7852 / 10000 
Accuracy val: 78.52% 
Distribution val (in %): [11.36, 9.25, 10.14, 13.28, 9.68, 6.56, 10.06, 9.02, 9.88, 10.77]



In [106]:
# Persist the trained weights, then reload them into a fresh ResNet to check
# that the checkpoint round-trips.
os.makedirs('./models', exist_ok=True)  # torch.save does not create missing directories
torch.save(cifar_net.state_dict(), './models/resnet18.pth')
model = ResNet()
# The fresh model lives on the CPU, so map the checkpoint there (this also keeps
# it loadable on machines without a GPU). weights_only=True restricts
# unpickling to tensors and plain containers instead of arbitrary objects.
model.load_state_dict(
    torch.load('./models/resnet18.pth', map_location='cpu', weights_only=True)
)

<All keys matched successfully>

The results are good considering only 10 epochs of training and minimal modifications! The distribution is as expected, with approximately 10% for each class. However, there is noticeable overfitting: the model's accuracy on the training set reaches about 98%, while its accuracy on the validation set drops below 80%.

From now on, we will test multiple models to prevent overfitting. To clearly visualize the performance of each model, we will use TensorBoard.

In [5]:
from torch.utils.tensorboard import SummaryWriter
# Notebook-level TensorBoard writer; it is flushed and closed after the
# training run further down.
writer = SummaryWriter()

In [7]:
# Add the writer to the training loop 
def training_loop_tb(n_epochs, loader, model, optimizer, loss_fn, device, logdir="runs/tests", writer=None):
    """Train ``model`` and log per-epoch loss and accuracy to TensorBoard.

    After every epoch the sample-weighted mean loss is logged under
    ``'Loss/train'`` and the accuracy (in percent) under ``'Accuracy/train'``,
    with the epoch number as the step.

    Args:
        n_epochs: Number of passes over ``loader``.
        loader: Iterable of ``(imgs, labels)`` batches.
        model: Module to train; it is moved to ``device``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``(outputs, labels) -> scalar loss tensor``.
        device: Device on which the batches and the model are placed.
        logdir: Directory for the event files when no ``writer`` is passed.
        writer: Optional object exposing ``add_scalar(tag, value, step)``. When
            given, it is used as-is and left open for the caller. When omitted,
            a ``SummaryWriter`` on ``logdir`` is created and closed here.
    """
    # Previously `logdir` was ignored and a global `writer` was used; the writer
    # is now explicit, so the function no longer depends on notebook state.
    owns_writer = writer is None
    if owns_writer:
        from torch.utils.tensorboard import SummaryWriter
        writer = SummaryWriter(log_dir=logdir)

    try:
        model = model.to(device)
        for epoch in range(1, n_epochs + 1):
            model.train()
            total_loss = 0.0
            correct = 0
            total = 0

            for imgs, labels in loader:
                imgs, labels = imgs.to(device), labels.to(device)

                optimizer.zero_grad()

                outputs = model(imgs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()

                # Weight by batch size so the epoch value is a per-sample mean.
                total_loss += loss.item() * imgs.size(0)
                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

            avg_loss = total_loss / total
            accuracy = correct / total * 100

            writer.add_scalar('Loss/train', avg_loss, epoch)
            writer.add_scalar('Accuracy/train', accuracy, epoch)
    finally:
        # Only close a writer created here; a caller-supplied one stays open.
        if owns_writer:
            writer.close()

In [None]:
# Note: when `downsample` is not None, BasicBlock[0] of the stage carries
# (downsample): Sequential(
#         (0): Conv2d(in_features, out_features, kernel_size=(1, 1), stride=(2, 2), bias=False)
#         (1): BatchNorm2d(out_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#       )

In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block laid out like the ResNet-18 blocks printed above.

    The main path is ``conv1 -> bn1 -> relu -> conv2 -> bn2``. Its result is
    added to the shortcut and passed through a final ReLU.

    Args:
        in_features: Number of input channels.
        out_features: Number of output channels.
        stride: Stride of the first 3x3 convolution (default 1).
        downsample: Optional module applied to the input to form the shortcut.
            It is needed whenever the main path changes the channel count or the
            spatial size, e.g. a 1x1 stride-2 convolution followed by batch norm.
            With ``None`` the input itself is the shortcut.
    """

    def __init__(self, in_features, out_features, stride=1, downsample=None):
        super().__init__()
        self.conv1 = nn.Conv2d(in_features, out_features, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_features)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_features, out_features, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_features)
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        identity = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        return self.relu(out + identity)

In [9]:
class ResNetDropout(nn.Module):
    """CIFAR-sized ResNet-18 with dropout in front of the classification head.

    The stem is a 3x3 / stride-1 convolution and ``maxpool`` is an identity.
    ``forward`` calls the backbone's layers one by one (without going through
    ``maxpool``) so that dropout with p=0.25 can be applied to the pooled,
    flattened features right before the 10-way linear layer.

    Args:
        weights: Passed through to ``torchvision.models.resnet18``.
    """

    def __init__(self, weights=None):
        super().__init__()

        backbone = models.resnet18(weights=weights)
        backbone.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        backbone.maxpool = nn.Identity()
        self.model = backbone

        self.dropout = nn.Dropout(p=0.25)
        self.model.fc = nn.Linear(in_features=512, out_features=10, bias=True)

    def forward(self, x):
        """Return class logits for the image batch ``x``."""
        net = self.model

        # Stem: convolution, batch norm, ReLU.
        features = net.relu(net.bn1(net.conv1(x)))

        # The four residual stages, in order.
        for stage in (net.layer1, net.layer2, net.layer3, net.layer4):
            features = stage(features)

        pooled = torch.flatten(net.avgpool(features), start_dim=1)

        return net.fc(self.dropout(pooled))

In [15]:
# Train the dropout variant with the same loss, optimizer settings and epoch
# count as the plain ResNet run, using the TensorBoard-enabled loop.
cifdrop_net = ResNetDropout()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(cifdrop_net.parameters(), lr=0.01, momentum=0.9)
training_loop_tb(
    n_epochs=10,
    loader=train_loader,
    model=cifdrop_net, 
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device
)
# Flush pending events to disk and release the notebook-level writer.
writer.flush()
writer.close()

In [None]:
class ResNetSkipLayers(nn.Module):
    """Work-in-progress ResNet-18 variant with the CIFAR-sized stem.

    NOTE(review): this class is unfinished. ``num_layers`` is stored but not yet
    used, and no ``forward`` is defined, so instances cannot be called yet.

    Args:
        num_layers: Stored on the instance as ``self.num_layers``.
        weights: Passed through to ``torchvision.models.resnet18``. Defaults to
            ``None``; the original body referenced an undefined ``weights`` name.
    """

    def __init__(self, num_layers, weights=None):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.num_layers = num_layers

        self.model = models.resnet18(weights=weights)
        self.model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.model.maxpool = nn.Identity()

In [22]:
# Evaluate the dropout variant on both splits for comparison with the plain ResNet.
model_accuracy(
    model=cifdrop_net,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device
)

Score train: 49058 / 50000 
Accuracy train: 98.12% 
Distribution train (in %): [10.09, 10.11, 9.29, 10.10, 10.28, 10.02, 10.04, 10.38, 9.70, 9.99]

Score val: 7890 / 10000 
Accuracy val: 78.90% 
Distribution val (in %): [10.54, 10.31, 6.94, 10.47, 11.00, 9.92, 10.35, 11.77, 8.76, 9.94]

