# Using pre-trained CNN

In this lab, we will see:

- Zero-shot performance of pre-trained backbone
- Use pre-trained CNN as backbone
- Fine-tuning the pre-trained CNN

In [None]:
import copy

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

In [None]:
# Hyper-parameters shared by the experiments below
batch_size = 64
lr = 0.01
epochs = 10
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Per-channel mean / std used to normalise both the training and the evaluation images
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random crop + horizontal flip augmentation, then normalisation
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std),
])

# Evaluation pipeline: no augmentation, so validation / test metrics are deterministic
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(cifar10_mean, cifar10_std),
])

dataset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform_train)
# Second view of the same training images with the evaluation transform; the
# validation subset is taken from this view so it is not randomly augmented.
dataset_eval = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform_test)

# create a split for train/validation. We can use early stop
# A seeded permutation keeps the 40000 / 10000 split reproducible across runs and
# guarantees the two subsets are disjoint even though they come from two dataset views.
split_generator = torch.Generator().manual_seed(0)
shuffled_indices = torch.randperm(len(dataset), generator=split_generator).tolist()
trainset = torch.utils.data.Subset(dataset, shuffled_indices[:40000])
valset = torch.utils.data.Subset(dataset_eval, shuffled_indices[40000:])

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2,
                                          drop_last=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                          shuffle=False, num_workers=2,
                                          drop_last=False)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2,
                                          drop_last=False)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 29.7MB/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


## Load a pre-defined network with pretrained weights



In [None]:
# Load ResNet-18 with pre-trained weights
net = models.resnet18(pretrained=True)
# override the fc layer of the network since it is of 1000 classes by default (ImageNet);
# the input width is read from the existing layer instead of being hard-coded
net.fc = nn.Linear(net.fc.in_features, 10)
net.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 103MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
# count the trainable parameters of the model
def count_trainable_parameters(model):
    """Return the total number of scalar entries in the parameters of `model` that require gradients."""
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total
count_trainable_parameters(net)

11181642

In [None]:
# Freeze every weight of the network, then re-enable gradients for the fc layer only
for tensor in net.parameters():
    tensor.requires_grad_(False)
net.fc.weight.requires_grad_(True)
net.fc.bias.requires_grad_(True)
# Display how many parameters are still trainable
count_trainable_parameters(net)

5130

In [None]:
# define train and test function
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and return the mean of the per-batch losses.

    The loss is computed with the notebook-level `criterion`. A progress line
    is printed every 500 batches; `epoch` is only used in that message.
    """
    model.train()
    batch_losses = []
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        if step % 500 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(inputs), len(train_loader.dataset),
                100. * step / len(train_loader), batch_loss.item()))
        batch_losses.append(batch_loss.item())
    return np.mean(batch_losses)

def test(model, device, test_loader, val=False):
    """Evaluate `model` on `test_loader` and return `(average_loss, accuracy)`.

    The loss is computed with the notebook-level `criterion`. A summary line is
    printed, labelled "Val" when `val` is true and "Test" otherwise.

    Returns:
        test_loss: per-sample average loss over the whole dataset.
        test_acc: fraction of correctly classified samples, in [0, 1].
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    num_samples = len(test_loader.dataset)
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # The notebook's criterion (nn.CrossEntropyLoss, default reduction) returns the
            # batch mean; weight it by the batch size so the final value is a true
            # per-sample average even when the last batch is smaller.
            total_loss += criterion(output, target).item() * data.size(0)
            pred = output.argmax(dim=1, keepdim=True)  # index of the highest logit
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss = total_loss / num_samples
    test_acc = correct / num_samples

    mode = "Val" if val else "Test"
    print('\n{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        mode,
        test_loss, correct, num_samples,
        100. * test_acc))
    return test_loss, test_acc

In [None]:
# Loss used by train() / test(), and an SGD optimizer over the network's parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=lr,
    momentum=0.9,
    weight_decay=1e-04,
)

In [None]:
# Main loop: one training pass followed by one validation pass per epoch,
# recording the training loss, validation loss and validation accuracy.
train_losses, val_losses, val_accuracies = [], [], []
model_state_dict = None

for epoch in range(1, epochs + 1):
    train_loss = train(net, device, trainloader, optimizer, epoch)
    train_losses.append(train_loss)

    val_loss, val_acc = test(net, device, valloader, val=True)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

\Val set: Average loss: 331.4472, Accuracy: 3523/10000 (35%)

\Val set: Average loss: 333.1411, Accuracy: 3540/10000 (35%)

\Val set: Average loss: 338.2413, Accuracy: 3388/10000 (34%)

\Val set: Average loss: 309.4349, Accuracy: 3679/10000 (37%)

\Val set: Average loss: 346.1276, Accuracy: 3300/10000 (33%)

\Val set: Average loss: 324.6189, Accuracy: 3504/10000 (35%)

\Val set: Average loss: 327.5846, Accuracy: 3550/10000 (36%)

\Val set: Average loss: 315.0072, Accuracy: 3676/10000 (37%)

\Val set: Average loss: 337.2147, Accuracy: 3438/10000 (34%)

\Val set: Average loss: 321.2299, Accuracy: 3531/10000 (35%)



In [None]:
# Evaluate on the test loader; `val` keeps its default, so the printed summary is labelled "Test"
test_loss, test_acc = test(net, device, testloader)

\Test set: Average loss: 343.2623, Accuracy: 3477/10000 (35%)



## Add additional layer to the pre-trained model


In [None]:
# New two-layer classifier head: 512 -> 128 -> 10 with a ReLU in between
fc1 = nn.Linear(512, 128)
fc_out = nn.Linear(128, 10)

# Replace the model's classifier with the new sequential head
net.fc = nn.Sequential(fc1, nn.ReLU(), fc_out)
net.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

## Fine-tuning some part of the CNN (not only the classifier)

In [None]:
# Unfreeze layer4 so the last residual stage is fine-tuned together with the classifier
net.layer4.requires_grad_(True)

# Make every parameter of the classifier head trainable. Module.requires_grad_ sets the
# flag on the parameters themselves; a plain `net.fc.requires_grad = True` assignment
# would only create an attribute on the module object.
net.fc.requires_grad_(True)

# Setting different learning rates: small steps for the pre-trained stage,
# larger steps for the newly initialised head
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.001}
fc_params = {'params': net.fc.parameters(), 'lr': 0.1}

# SGD with momentum; each parameter group uses the learning rate given above
optimizer = torch.optim.SGD([layer4_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
def get_results(net, trainloader, optimizer, valloader, testloader, device, num_epochs=None):
    """Train `net`, validating after every epoch, then evaluate it once on the test set.

    Args:
        net: model to train; it is updated in place.
        trainloader: loader passed to `train` each epoch.
        optimizer: optimizer passed to `train`.
        valloader: loader evaluated with `test(..., val=True)` after each epoch.
        testloader: loader evaluated once after the last epoch.
        device: device passed through to `train` and `test`.
        num_epochs: number of epochs to run; when None, the notebook-level
            `epochs` setting is used.

    Returns:
        (train_losses, val_losses, val_accuracies, test_loss, test_acc), where the
        first three are per-epoch lists.
    """
    if num_epochs is None:
        num_epochs = epochs  # fall back to the notebook-wide setting

    train_losses = []
    val_losses = []
    val_accuracies = []

    for epoch in range(1, num_epochs + 1):
        train_loss = train(net, device, trainloader, optimizer, epoch)
        train_losses.append(train_loss)
        val_loss, val_acc = test(net, device, valloader, val=True)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

    test_loss, test_acc = test(net, device, testloader)
    return train_losses, val_losses, val_accuracies, test_loss, test_acc

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 286.7457, Accuracy: 3449/10000 (34%)

\Val set: Average loss: 244.8259, Accuracy: 4245/10000 (42%)

\Val set: Average loss: 218.5828, Accuracy: 5079/10000 (51%)

\Val set: Average loss: 202.0690, Accuracy: 5539/10000 (55%)

\Val set: Average loss: 192.5636, Accuracy: 5813/10000 (58%)

\Val set: Average loss: 191.3492, Accuracy: 5800/10000 (58%)

\Val set: Average loss: 183.2410, Accuracy: 6012/10000 (60%)

\Val set: Average loss: 179.0106, Accuracy: 6155/10000 (62%)

\Val set: Average loss: 174.7068, Accuracy: 6202/10000 (62%)

\Val set: Average loss: 168.6194, Accuracy: 6304/10000 (63%)

\Test set: Average loss: 169.8140, Accuracy: 6293/10000 (63%)

Test accuracy: 0.629


## Exercise 1

How many layers is it better to fine-tune?

Is it better to update all the weights of the model?

In [None]:
# Try to add more and more layers to finetuning and check
# Here: fine-tune layer3, layer4 and a new classifier head on a fresh pre-trained ResNet-18
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates: small for the pre-trained stages, larger for the new head
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.001}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.001}
fc_params = {'params': net.fc.parameters(), 'lr': 0.1}

# SGD with momentum; each parameter group uses the learning rate given above
optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)



In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 222.1107, Accuracy: 5037/10000 (50%)

\Val set: Average loss: 174.3838, Accuracy: 6211/10000 (62%)

\Val set: Average loss: 159.0620, Accuracy: 6583/10000 (66%)

\Val set: Average loss: 146.5400, Accuracy: 6771/10000 (68%)

\Val set: Average loss: 137.4820, Accuracy: 7009/10000 (70%)

\Val set: Average loss: 136.9796, Accuracy: 7037/10000 (70%)

\Val set: Average loss: 129.9981, Accuracy: 7133/10000 (71%)

\Val set: Average loss: 126.1389, Accuracy: 7218/10000 (72%)

\Val set: Average loss: 123.7831, Accuracy: 7278/10000 (73%)

\Val set: Average loss: 121.9613, Accuracy: 7380/10000 (74%)

\Test set: Average loss: 117.7501, Accuracy: 7503/10000 (75%)

Test accuracy: 0.750


In [None]:
# Try to add more and more layers to finetuning and check
# Here: fine-tune layer2, layer3, layer4 and a new classifier head on a fresh pre-trained ResNet-18
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer2, net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates: small for the pre-trained stages, larger for the new head
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.001}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.001}
layer2_params = {'params': net.layer2.parameters(), 'lr': 0.001}
fc_params = {'params': net.fc.parameters(), 'lr': 0.1}

# SGD with momentum; each parameter group uses the learning rate given above
optimizer = torch.optim.SGD([layer4_params, layer3_params, layer2_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 200.9513, Accuracy: 5459/10000 (55%)

\Val set: Average loss: 147.7013, Accuracy: 6760/10000 (68%)

\Val set: Average loss: 131.3643, Accuracy: 7198/10000 (72%)

\Val set: Average loss: 120.4690, Accuracy: 7429/10000 (74%)

\Val set: Average loss: 117.3681, Accuracy: 7493/10000 (75%)

\Val set: Average loss: 112.1097, Accuracy: 7584/10000 (76%)

\Val set: Average loss: 103.0345, Accuracy: 7780/10000 (78%)

\Val set: Average loss: 103.8277, Accuracy: 7794/10000 (78%)

\Val set: Average loss: 101.6844, Accuracy: 7835/10000 (78%)

\Val set: Average loss: 101.3696, Accuracy: 7816/10000 (78%)

\Test set: Average loss: 96.6028, Accuracy: 7977/10000 (80%)

Test accuracy: 0.798


In [None]:
# Try to add more and more layers to finetuning and check
# Here: fine-tune layer1..layer4 and a new classifier head on a fresh pre-trained ResNet-18
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first
# (this leaves the stem, conv1 / bn1, frozen); otherwise "unfreezing" is a no-op and
# gradients are computed for layers that the optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer1, net.layer2, net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates: small for the pre-trained stages, larger for the new head
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.001}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.001}
layer2_params = {'params': net.layer2.parameters(), 'lr': 0.001}
layer1_params = {'params': net.layer1.parameters(), 'lr': 0.001}
fc_params = {'params': net.fc.parameters(), 'lr': 0.1}

# SGD with momentum; each parameter group uses the learning rate given above
optimizer = torch.optim.SGD([layer4_params, layer3_params, layer2_params, layer1_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 219.0783, Accuracy: 4824/10000 (48%)

\Val set: Average loss: 149.3416, Accuracy: 6770/10000 (68%)

\Val set: Average loss: 133.9396, Accuracy: 7075/10000 (71%)

\Val set: Average loss: 116.9873, Accuracy: 7509/10000 (75%)

\Val set: Average loss: 120.6367, Accuracy: 7470/10000 (75%)

\Val set: Average loss: 105.7238, Accuracy: 7742/10000 (77%)

\Val set: Average loss: 101.5547, Accuracy: 7884/10000 (79%)

\Val set: Average loss: 100.6822, Accuracy: 7869/10000 (79%)

\Val set: Average loss: 99.8254, Accuracy: 7877/10000 (79%)

\Val set: Average loss: 94.1711, Accuracy: 8004/10000 (80%)

\Test set: Average loss: 90.8844, Accuracy: 8101/10000 (81%)

Test accuracy: 0.810


## Exercise 2

Try to change the hyper-parameters of the fine-tuning (e.g. lr of CNN layers and lr of the fc layers) and/or network architecture

In [None]:
# Model chosen: the one with two fine-tuned stages (layer3 and layer4) plus the new head.
# Hyper-parameter variant: the same learning rate (0.01) for every parameter group.
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates
# Changing fine tuning parameters
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.01}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.01}
fc_params = {'params': net.fc.parameters(), 'lr': 0.01}

optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 250.2112, Accuracy: 4303/10000 (43%)

\Val set: Average loss: 184.9564, Accuracy: 5956/10000 (60%)

\Val set: Average loss: 170.5542, Accuracy: 6214/10000 (62%)

\Val set: Average loss: 156.9608, Accuracy: 6598/10000 (66%)

\Val set: Average loss: 149.0362, Accuracy: 6739/10000 (67%)

\Val set: Average loss: 139.3544, Accuracy: 6954/10000 (70%)

\Val set: Average loss: 134.9567, Accuracy: 7038/10000 (70%)

\Val set: Average loss: 130.8797, Accuracy: 7200/10000 (72%)

\Val set: Average loss: 127.3314, Accuracy: 7292/10000 (73%)

\Val set: Average loss: 127.7791, Accuracy: 7220/10000 (72%)

\Test set: Average loss: 116.7587, Accuracy: 7469/10000 (75%)

Test accuracy: 0.747


In [None]:
# Model chosen: the one with two fine-tuned stages (layer3 and layer4) plus the new head.
# Hyper-parameter variant: learning rate 0.1 for the pre-trained stages, 0.001 for the head.
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates
# Changing fine tuning parameters
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.1}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.1}
fc_params = {'params': net.fc.parameters(), 'lr': 0.001}

optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 142.0381, Accuracy: 6914/10000 (69%)

\Val set: Average loss: 132.4523, Accuracy: 7025/10000 (70%)

\Val set: Average loss: 126.4757, Accuracy: 7232/10000 (72%)

\Val set: Average loss: 117.3408, Accuracy: 7445/10000 (74%)

\Val set: Average loss: 119.4196, Accuracy: 7361/10000 (74%)

\Val set: Average loss: 112.8146, Accuracy: 7527/10000 (75%)

\Val set: Average loss: 113.6506, Accuracy: 7496/10000 (75%)

\Val set: Average loss: 113.1269, Accuracy: 7486/10000 (75%)

\Val set: Average loss: 107.8911, Accuracy: 7610/10000 (76%)

\Val set: Average loss: 108.2420, Accuracy: 7606/10000 (76%)

\Test set: Average loss: 97.9299, Accuracy: 7807/10000 (78%)

Test accuracy: 0.781


In [None]:
# Model chosen: the one with two fine-tuned stages (layer3 and layer4) plus the new head.
# Hyper-parameter variant: the same learning rate (0.1) for every parameter group.
net = models.resnet18(pretrained=True)

# Replace the 1000-class ImageNet classifier with a small MLP head: in_features -> 128 -> 10
fc1 = nn.Linear(net.fc.in_features, 128)
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    nn.Linear(128, 10),
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates
# Changing fine tuning parameters
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.1}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.1}
fc_params = {'params': net.fc.parameters(), 'lr': 0.1}

optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 212.5698, Accuracy: 5229/10000 (52%)

\Val set: Average loss: 157.4727, Accuracy: 6654/10000 (67%)

\Val set: Average loss: 157.2708, Accuracy: 6659/10000 (67%)

\Val set: Average loss: 138.9392, Accuracy: 7021/10000 (70%)

\Val set: Average loss: 141.5030, Accuracy: 6915/10000 (69%)

\Val set: Average loss: 154.7159, Accuracy: 6621/10000 (66%)

\Val set: Average loss: 138.0316, Accuracy: 7026/10000 (70%)

\Val set: Average loss: 130.1837, Accuracy: 7210/10000 (72%)

\Val set: Average loss: 140.9762, Accuracy: 6928/10000 (69%)

\Val set: Average loss: 131.5308, Accuracy: 7179/10000 (72%)

\Test set: Average loss: 119.7609, Accuracy: 7468/10000 (75%)

Test accuracy: 0.747


In [None]:
# Changing architecture
# Model chosen: the one with two fine-tuned stages (layer3 and layer4), now with a
# three-layer classifier head: in_features -> 128 -> 64 -> 10.
net = models.resnet18(pretrained=True)
fc1 = nn.Linear(net.fc.in_features, 128)
fc2 = nn.Linear(128, 64)
fc3 = nn.Linear(64, 10)

# Replace the model's classifier with the new sequential head
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    fc2,
    nn.ReLU(),
    fc3
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates
# Changing fine tuning parameters
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.1}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.1}
fc_params = {'params': net.fc.parameters(), 'lr': 0.001}

optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)



In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 144.6357, Accuracy: 6859/10000 (69%)

\Val set: Average loss: 137.8746, Accuracy: 6914/10000 (69%)

\Val set: Average loss: 121.7115, Accuracy: 7315/10000 (73%)

\Val set: Average loss: 118.5386, Accuracy: 7343/10000 (73%)

\Val set: Average loss: 117.1844, Accuracy: 7461/10000 (75%)

\Val set: Average loss: 110.6038, Accuracy: 7575/10000 (76%)

\Val set: Average loss: 108.5815, Accuracy: 7651/10000 (77%)

\Val set: Average loss: 109.8145, Accuracy: 7581/10000 (76%)

\Val set: Average loss: 108.1900, Accuracy: 7675/10000 (77%)

\Val set: Average loss: 112.9115, Accuracy: 7547/10000 (75%)

\Test set: Average loss: 99.1942, Accuracy: 7825/10000 (78%)

Test accuracy: 0.782


In [None]:
# Model chosen: the one with two fine-tuned stages (layer3 and layer4), now with a
# four-layer classifier head: in_features -> 128 -> 64 -> 32 -> 10.
net = models.resnet18(pretrained=True)
fc1 = nn.Linear(net.fc.in_features, 128)
fc2 = nn.Linear(128, 64)
fc3 = nn.Linear(64, 32)
fc4 = nn.Linear(32, 10)

# Replace the model's classifier with the new sequential head
net.fc = nn.Sequential(
    fc1,
    nn.ReLU(),
    fc2,
    nn.ReLU(),
    fc3,
    nn.ReLU(),
    fc4
)
net.to(device)

# A freshly loaded model has every parameter trainable, so freeze everything first;
# otherwise "unfreezing" is a no-op and gradients are computed for layers that the
# optimizer never updates.
net.requires_grad_(False)

# Unfreeze the stages being fine-tuned and the classifier head. Module.requires_grad_
# sets the flag on every parameter of the module (a plain `module.requires_grad = True`
# assignment would only create an attribute on the module object).
for module in (net.layer3, net.layer4, net.fc):
    module.requires_grad_(True)

# Setting different learning rates
# Changing fine tuning parameters
layer4_params = {'params': net.layer4.parameters(), 'lr': 0.1}
layer3_params = {'params': net.layer3.parameters(), 'lr': 0.1}
fc_params = {'params': net.fc.parameters(), 'lr': 0.001}

optimizer = torch.optim.SGD([layer4_params, layer3_params, fc_params], momentum=0.9, weight_decay=1e-04)

In [None]:
# Train and validate with the current `net` / `optimizer`, then keep only the final
# test accuracy (the four other values returned by get_results are discarded).
_, _, _, _, test_acc = get_results(net, trainloader, optimizer, valloader, testloader, device)
print(f"Test accuracy: {test_acc:.3f}")

\Val set: Average loss: 156.2044, Accuracy: 6628/10000 (66%)

\Val set: Average loss: 130.9022, Accuracy: 7175/10000 (72%)

\Val set: Average loss: 131.2744, Accuracy: 7130/10000 (71%)

\Val set: Average loss: 122.2112, Accuracy: 7288/10000 (73%)

\Val set: Average loss: 118.3381, Accuracy: 7385/10000 (74%)

\Val set: Average loss: 116.2314, Accuracy: 7420/10000 (74%)

\Val set: Average loss: 113.3391, Accuracy: 7551/10000 (76%)

\Val set: Average loss: 115.1700, Accuracy: 7478/10000 (75%)

\Val set: Average loss: 108.3399, Accuracy: 7612/10000 (76%)

\Val set: Average loss: 109.9229, Accuracy: 7581/10000 (76%)

\Test set: Average loss: 97.5701, Accuracy: 7865/10000 (79%)

Test accuracy: 0.786


## Exercise 3

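Try to implement a model selection strategy (also known as early stopping) based on the validation accuracy on CIFAR-10.

Consider using the following two commands to save and to load, respectively, the state of all the parameters of the model at a given moment. Note that `state_dict()` returns references to the live parameter tensors, so take a copy (e.g. with `copy.deepcopy`) if the snapshot must survive further training.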

In [None]:
# save all the parameters of the model
# state_dict() returns references to the live parameter tensors, so deep-copy it;
# otherwise the "saved" weights keep changing while training continues.
model_state_dict = copy.deepcopy(net.state_dict())

# load saved weights on the model
net.load_state_dict(model_state_dict)

<All keys matched successfully>

In [None]:
def train_and_eval(model, device, train_loader, valloader, optimizer, epoch):
    """Train `model` for one epoch, then evaluate it on `valloader`.

    The loss is computed with the notebook-level `criterion`. A progress line is
    printed every 500 training batches; `epoch` is only used in that message.

    Returns:
        (mean_train_loss, mean_val_loss, val_accuracy): the means are taken over
        the per-batch losses, and the accuracy is the fraction of validation
        samples classified correctly.
    """
    model.train()
    losses = []
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 500 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
        losses.append(loss.item())

    model.eval()
    val_losses = []
    val_corr = 0
    with torch.no_grad():
        for data, target in valloader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            val_losses.append(criterion(output, target).item())
            pred = output.argmax(dim=1, keepdim=True)  # index of the highest logit
            val_corr += pred.eq(target.view_as(pred)).sum().item()

    # Use the real validation-set size rather than a hard-coded 10000, so the
    # accuracy stays correct if the split changes.
    val_accuracy = val_corr / len(valloader.dataset)
    return np.mean(losses), np.mean(val_losses), val_accuracy

In [None]:
# ResNet-18 trained from scratch (no pre-trained weights) with a 10-class head
net = models.resnet18()
net.fc = nn.Linear(512, 10)
net.to(device)

optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=1e-04)
best_accuracy = 0
best_model_state_dict = None
# Epochs are numbered from 1 so the per-batch progress lines and the per-epoch
# summary report the same epoch number.
for epoch in range(1, 56):
    train_loss, val_loss, val_acc = train_and_eval(net, device, trainloader, valloader, optimizer, epoch)
    print(f"Epoch {epoch} mean train loss: {train_loss:.3f}")
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        # state_dict() returns references to the live tensors, so deep-copy it;
        # without the copy the "best" weights would silently follow later updates
        # and end up identical to the final model.
        best_model_state_dict = copy.deepcopy(net.state_dict())
        print("New best model")

# Accuracy of the model as it is after the last epoch
test_loss, test_acc = test(net, device, testloader)
print(f"Test accuracy: {test_acc:.3f}")

# Accuracy of the snapshot with the best validation accuracy
net.load_state_dict(best_model_state_dict)
test_loss, test_acc = test(net, device, testloader)
print(f"Test accuracy with early stopping: {test_acc:.3f}")

Epoch 1 mean train loss: 1.795
New best model
Epoch 2 mean train loss: 1.460
New best model
Epoch 3 mean train loss: 1.283
New best model
Epoch 4 mean train loss: 1.146
New best model
Epoch 5 mean train loss: 1.042
New best model
Epoch 6 mean train loss: 0.971
New best model
Epoch 7 mean train loss: 0.908
Epoch 8 mean train loss: 0.864
New best model
Epoch 9 mean train loss: 0.821
New best model
Epoch 10 mean train loss: 0.778
New best model
Epoch 11 mean train loss: 0.761
New best model
Epoch 12 mean train loss: 0.720
New best model
Epoch 13 mean train loss: 0.696
New best model
Epoch 14 mean train loss: 0.670
Epoch 15 mean train loss: 0.649
New best model
Epoch 16 mean train loss: 0.629
Epoch 17 mean train loss: 0.612
New best model
Epoch 18 mean train loss: 0.595
Epoch 19 mean train loss: 0.574
New best model
Epoch 20 mean train loss: 0.565
Epoch 21 mean train loss: 0.553
New best model
Epoch 22 mean train loss: 0.541
Epoch 23 mean train loss: 0.516
New best model
Epoch 24 mean trai

In [None]:
# Best Model with scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau

# NOTE(review): `transform` is not passed to any dataset in this notebook as shown, and its
# normalisation std differs from the one in transform_train / transform_test. The subsets
# below still use the transform attached to `dataset`; confirm whether this was meant to be used.
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.4913998, 0.48215738, 0.44653124), (0.24703224, 0.2434851, 0.26158783)),
                                transforms.RandomCrop(32, padding=4),
                                transforms.RandomHorizontalFlip()])

# View of the training images with the evaluation transform, so the validation subset
# is not randomly augmented (validation loss drives the scheduler and model selection).
dataset_noaug = torchvision.datasets.CIFAR10(root='./data', train=True,
                                             download=True, transform=transform_test)

# Seeded 40000 / 10000 split; indexing both views with one permutation keeps the
# training and validation subsets disjoint and reproducible.
split_generator = torch.Generator().manual_seed(0)
shuffled_indices = torch.randperm(len(dataset), generator=split_generator).tolist()
trainset = torch.utils.data.Subset(dataset, shuffled_indices[:40000])
valset = torch.utils.data.Subset(dataset_noaug, shuffled_indices[40000:])

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2,
                                          drop_last=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                          shuffle=False, num_workers=2,
                                          drop_last=False)


class Net(nn.Module):
    """Small CNN: two convolution + max-pool stages followed by three linear layers.

    Both convolutions share the same kernel size, stride and padding. The input
    width of the first linear layer is computed from those settings, so any
    combination that leaves at least one pixel after the second pooling works.

    Args:
        kernel_size: kernel size of both convolutions.
        stride: stride of both convolutions.
        padding: zero-padding of both convolutions.
        input_size: height (= width) of the square input images; 32 by default.

    Raises:
        ValueError: if the settings shrink the feature map to nothing.
    """

    def __init__(self, kernel_size=5, stride=1, padding=0, input_size=32):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size, stride, padding)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size, stride, padding)

        # Follow the spatial size through (conv -> 2x2 pool) twice to size fc1.
        side = input_size
        for _ in range(2):
            side = (side + 2 * padding - kernel_size) // stride + 1  # convolution
            side = side // 2  # 2x2 max-pool with stride 2
        if side < 1:
            raise ValueError(
                "kernel_size={}, stride={}, padding={} leave no features for a "
                "{}x{} input".format(kernel_size, stride, padding, input_size, input_size))

        self.fc1 = nn.Linear(16 * side * side, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch of images `x`."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
net = Net(kernel_size=5, padding=2).to(device)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-04)
# Multiply the learning rate by 0.1 as soon as the monitored value stops decreasing
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=0)

best_accuracy = 0
best_model_state_dict = None
# Epochs are numbered from 1 so the per-batch progress lines and the per-epoch
# summary report the same epoch number.
for epoch in range(1, 31):
    train_loss, val_loss, val_acc = train_and_eval(net, device, trainloader, valloader, optimizer, epoch)
    print(f"Epoch {epoch} mean train loss: {train_loss:.3f}")
    # The scheduler only acts when it is given the monitored value each epoch;
    # mode='min' matches the validation loss.
    scheduler.step(val_loss)
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        # state_dict() returns references to the live tensors, so deep-copy it;
        # without the copy the "best" weights would silently follow later updates.
        best_model_state_dict = copy.deepcopy(net.state_dict())
        print("New best model")

# Accuracy of the model as it is after the last epoch
test_loss, test_acc = test(net, device, testloader)
print(f"Test accuracy: {test_acc:.3f}")

# Accuracy of the snapshot with the best validation accuracy
net.load_state_dict(best_model_state_dict)
test_loss, test_acc = test(net, device, testloader)
print(f"Test accuracy with early stopping: {test_acc:.3f}")