## Task 1 Fine-tuning a CNN Pretrained on ImageNet for Bird Recognition

## Preliminaries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
import datetime
import os
import copy
from torch.utils.tensorboard import SummaryWriter

# Set hyperparameters
# These are module-level names; several later cells rebind `learning_rate`,
# `num_epochs` and `l2` before calling the training functions.
data_dir = 'CUB_200_2011'  # dataset root; joined with 'train' / 'val' / 'test' when loading
num_classes = 200  # output size of the replaced `fc` layer
batch_size = 64  # batch size used by every DataLoader
num_epochs = 25
learning_rate = 0.001
momentum = 0.9  # read as a global by the SGD optimizers inside the fine-tuning functions
l2 = 0.01  # passed to SGD as `weight_decay`
patience = 5  # early-stopping patience (epochs without validation-accuracy improvement)

## Data loading and preprocessing

In [2]:
# Data augmentation / preprocessing.
# Training images get a random crop and a random horizontal flip; validation and
# test images use a fixed resize + center crop so evaluation is repeatable.
# All splits are normalized with the same per-channel mean / std.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Load data: one ImageFolder per split, read from <data_dir>/<split>.
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in ['train', 'val', 'test']}
# Only the training loader is shuffled; validation and test batches are served
# in a fixed order so repeated evaluations see the data identically.
dataloaders = {x: torch.utils.data.DataLoader(
    image_datasets[x], batch_size=batch_size, shuffle=(x == 'train'), num_workers=8) for x in ['train', 'val', 'test']}
# Number of samples per split, used to turn summed losses / correct counts into averages.
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Functions

### for fine-tuning

In [3]:
def train_fine_tuning(net, dataloaders, dataset_sizes, learning_rate, num_epochs, l2, patience, param_group=True):
    """
    Fine-tunes a pretrained model on a new dataset.

    Every epoch runs a 'train' pass followed by a 'val' pass, using SGD with a
    cross-entropy loss and a StepLR schedule (learning rate multiplied by 0.05
    every 10 epochs). Per-epoch loss and accuracy are logged to TensorBoard
    under ``runs/CUB_bird_classification/<timestamp>``. The weights with the
    best validation accuracy are kept; on exit (normal or early-stopped) they
    are loaded back into ``net`` and saved to ``CUB_best_weights_ft.pth``.

    Note: the SGD momentum and the target device are read from the module-level
    names ``momentum`` and ``device``.

    Args:
        net (torch.nn.Module): The neural network to train. Must expose an ``fc``
            layer when ``param_group`` is True.
        dataloaders (dict): A dictionary containing the 'train' and 'val' dataloaders.
        dataset_sizes (dict): A dictionary containing the sizes of the 'train' and 'val' datasets.
        learning_rate (float): The base learning rate for the optimizer.
        num_epochs (int): The maximum number of epochs to train for.
        l2 (float): Weight decay strength passed to SGD.
        patience (int): Number of consecutive epochs without validation-accuracy
            improvement after which training stops early.
        param_group (bool, optional): If True, the ``fc`` parameters use
            ``learning_rate * 10`` while all other parameters use ``learning_rate``.
            Default is True.

    Returns:
        net (torch.nn.Module): The network with the best-validation-accuracy weights loaded.
    """
    loss_fn = nn.CrossEntropyLoss()
    epochs_no_improve = 0

    # Define the optimizer with parameter groups if param_group is True
    if param_group:
        params_1x = [param for name, param in net.named_parameters() if name not in ["fc.weight", "fc.bias"]]
        optimizer = optim.SGD([
            {'params': params_1x},
            {'params': net.fc.parameters(), 'lr': learning_rate * 10}
        ], lr=learning_rate, weight_decay=l2, momentum=momentum)
    else:
        optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=l2, momentum=momentum)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.05)
    best_model_wts = copy.deepcopy(net.state_dict())  # Save the best model weights
    best_acc = 0.0  # Initialize the best accuracy

    log_dir = os.path.join("runs/CUB_bird_classification", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    writer = SummaryWriter(log_dir)
    try:
        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)
            stop_early = False

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    net.train()
                else:
                    net.eval()

                running_loss = 0.0
                running_corrects = 0

                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()  # Zero the parameter gradients

                    # Gradients are only tracked during the training phase
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = net(inputs)  # Forward pass
                        _, preds = torch.max(outputs, 1)  # Get predictions
                        loss = loss_fn(outputs, labels)  # Compute loss

                        if phase == 'train':
                            loss.backward()  # Backward pass
                            optimizer.step()  # Optimize the model

                    # The loss is a batch mean, so weight it by the batch size
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # TensorBoard
                if phase == 'train':
                    writer.add_scalar('Training loss', epoch_loss, epoch)
                    writer.add_scalar('Training accuracy', epoch_acc, epoch)
                else:
                    writer.add_scalar('Validation loss', epoch_loss, epoch)
                    writer.add_scalar('Validation accuracy', epoch_acc, epoch)

                # Deep copy the model if the validation accuracy is the best seen so far
                if phase == 'val':
                    if epoch_acc > best_acc:
                        best_acc = epoch_acc
                        best_model_wts = copy.deepcopy(net.state_dict())
                        epochs_no_improve = 0
                    else:
                        epochs_no_improve += 1
                    stop_early = epochs_no_improve == patience

            # Early stopping: leave the loop and share the common teardown below
            if stop_early:
                print('Early stopping!')
                break

            # Step the scheduler
            scheduler.step()
    finally:
        # Flush and close the TensorBoard log even if training fails or is interrupted
        writer.close()

    print(f'Best val Acc: {best_acc:.4f}')
    net.load_state_dict(best_model_wts)
    torch.save(net.state_dict(), 'CUB_best_weights_ft.pth')
    return net

### for grid search

In [14]:
def train_fine_tuning_search(net, dataloaders, dataset_sizes, lr_small, lr_large, num_epochs, l2, param_group=True):
    """
    Function for grid search, verbose outputs omitted.

    Trains ``net`` for exactly ``num_epochs`` epochs (no LR schedule, no early
    stopping) with SGD and cross-entropy loss, logging per-epoch metrics to
    TensorBoard under ``runs/CUB_bird_search/<timestamp>``. Only the metrics of
    the last epoch and the best validation accuracy are printed.

    Note: the SGD momentum and the target device are read from the module-level
    names ``momentum`` and ``device``.

    Args:
        net (torch.nn.Module): The network to train. Must expose an ``fc`` layer
            when ``param_group`` is True.
        dataloaders (dict): Dictionary with 'train' and 'val' dataloaders.
        dataset_sizes (dict): Dictionary with the sizes of the 'train' and 'val' datasets.
        lr_small (float): Learning rate for all parameters except ``fc``
            (or for every parameter when ``param_group`` is False).
        lr_large (float): Learning rate for the ``fc`` parameters when ``param_group`` is True.
        num_epochs (int): Number of epochs to train for.
        l2 (float): Weight decay strength passed to SGD.
        param_group (bool, optional): Whether to use a separate learning rate for ``fc``.

    Returns:
        tuple: ``(net, final_loss, final_acc, best_acc)`` where ``net`` has the
        best-validation-accuracy weights loaded, ``final_loss`` / ``final_acc``
        are the validation loss / accuracy of the last epoch (``None`` if no
        epoch was run), and ``best_acc`` is the best validation accuracy seen.
    """
    loss_fn = nn.CrossEntropyLoss()

    # Define the optimizer with parameter groups if param_group is True
    if param_group:
        params_1x = [param for name, param in net.named_parameters() if name not in ["fc.weight", "fc.bias"]]
        optimizer = optim.SGD([
            {'params': params_1x},
            {'params': net.fc.parameters(), 'lr': lr_large}
        ], lr=lr_small, weight_decay=l2, momentum=momentum)
    else:
        optimizer = optim.SGD(net.parameters(), lr=lr_small, weight_decay=l2, momentum=momentum)

    best_model_wts = copy.deepcopy(net.state_dict())  # Save the best model weights
    best_acc = 0.0  # Initialize the best accuracy
    # Defined up front so the return statement is valid even when num_epochs <= 0
    final_loss = None
    final_acc = None

    log_dir = os.path.join("runs/CUB_bird_search", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    writer = SummaryWriter(log_dir)
    try:
        for epoch in range(num_epochs):
            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    net.train()
                else:
                    net.eval()

                running_loss = 0.0
                running_corrects = 0

                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()  # Zero the parameter gradients

                    # Gradients are only tracked during the training phase
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = net(inputs)  # Forward pass
                        _, preds = torch.max(outputs, 1)  # Get predictions
                        loss = loss_fn(outputs, labels)  # Compute loss

                        if phase == 'train':
                            loss.backward()  # Backward pass
                            optimizer.step()  # Optimize the model

                    # The loss is a batch mean, so weight it by the batch size
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]
                if epoch == num_epochs - 1:
                    # Assigned in both phases; the 'val' phase runs last, so the
                    # returned values are the final validation metrics.
                    final_loss = epoch_loss
                    final_acc = epoch_acc
                    print(f'Final {phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # TensorBoard
                if phase == 'train':
                    writer.add_scalar('Training loss', epoch_loss, epoch)
                    writer.add_scalar('Training accuracy', epoch_acc, epoch)
                else:
                    writer.add_scalar('Validation loss', epoch_loss, epoch)
                    writer.add_scalar('Validation accuracy', epoch_acc, epoch)

                # Deep copy the model if the validation accuracy is the best seen so far
                if phase == 'val' and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(net.state_dict())
    finally:
        # Flush and close the TensorBoard log even if training fails or is interrupted
        writer.close()

    print(f'Best val Acc: {best_acc:.4f}')
    net.load_state_dict(best_model_wts)
    return net, final_loss, final_acc, best_acc

### for training from scratch

In [9]:
def train_model_scratch(net, dataloaders, dataset_sizes, optimizer, num_epochs):
    """Train a model from scratch with a caller-supplied optimizer.

    Each epoch performs a 'train' pass and then a 'val' pass. A StepLR schedule
    multiplies the optimizer's learning rate by 0.1 every 25 epochs. Per-epoch
    loss and accuracy go to TensorBoard under ``runs/CUB_from_scratch/<timestamp>``.
    After the last epoch the weights with the highest validation accuracy are
    loaded into ``net`` and written to ``CUB_best_weights_scratch.pth``.

    Note: batches are moved to the module-level ``device``.

    Args:
        net (torch.nn.Module): The network to train.
        dataloaders (dict): Dictionary with 'train' and 'val' dataloaders.
        dataset_sizes (dict): Dictionary with the sizes of the 'train' and 'val' datasets.
        optimizer (torch.optim.Optimizer): Optimizer already bound to ``net``'s parameters.
        num_epochs (int): Number of epochs to train for.

    Returns:
        torch.nn.Module: ``net`` with the best-validation-accuracy weights loaded.
    """
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    writer = SummaryWriter(os.path.join("runs/CUB_from_scratch", run_stamp))
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)
    best_state = copy.deepcopy(net.state_dict())
    best_acc = 0.0

    # TensorBoard tags for each phase: (loss tag, accuracy tag)
    scalar_tags = {
        'train': ('training loss', 'training accuracy'),
        'val': ('validation loss', 'validation accuracy'),
    }

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        for phase in ('train', 'val'):
            training = phase == 'train'
            # train(True) enables training mode, train(False) is what eval() does
            net.train(training)

            loss_sum = 0.0
            n_correct = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # Only build the autograd graph while training
                with torch.set_grad_enabled(training):
                    outputs = net(inputs)
                    preds = torch.max(outputs, 1)[1]
                    loss = criterion(outputs, labels)

                    if training:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size
                loss_sum += loss.item() * inputs.size(0)
                n_correct += (preds == labels.data).sum()

            epoch_loss = loss_sum / dataset_sizes[phase]
            epoch_acc = n_correct.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            loss_tag, acc_tag = scalar_tags[phase]
            writer.add_scalar(loss_tag, epoch_loss, epoch)
            writer.add_scalar(acc_tag, epoch_acc, epoch)

            # Remember the weights whenever validation accuracy reaches a new high
            if not training and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_state = copy.deepcopy(net.state_dict())

        # Step the scheduler once per epoch
        scheduler.step()

    print(f'Best val Acc: {best_acc:.4f}')
    net.load_state_dict(best_state)
    torch.save(net.state_dict(), 'CUB_best_weights_scratch.pth')
    writer.close()
    return net

### for evaluation

In [5]:
def evaluate_model(model, dataloaders, criterion, phase='test'):
    """Compute the average loss and accuracy of ``model`` on one data split.

    The averages are taken over the samples actually yielded by
    ``dataloaders[phase]``, and batches are moved to the device of the model's
    own parameters, so the function depends only on its arguments.

    Args:
        model (torch.nn.Module): The network to evaluate; switched to eval mode.
        dataloaders (dict): Maps split names to iterables of ``(inputs, labels)`` batches.
        criterion (callable): Loss function returning the mean loss of a batch.
        phase (str, optional): Key of the split to evaluate. Default is 'test'.

    Returns:
        tuple: ``(loss, acc)`` where ``loss`` is a Python float (sample-weighted
        mean loss) and ``acc`` is a 0-dim double tensor (fraction of correct
        top-1 predictions).

    Raises:
        ValueError: If ``dataloaders[phase]`` yields no samples.
    """
    model.eval()

    # Run on whatever device the model lives on; CPU if it has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    running_loss = 0.0
    running_corrects = 0
    num_samples = 0

    with torch.no_grad():
        for inputs, labels in dataloaders[phase]:
            inputs = inputs.to(target_device)
            labels = labels.to(target_device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            batch_count = inputs.size(0)
            # The loss is a batch mean, so weight it by the batch size
            running_loss += loss.item() * batch_count
            running_corrects += torch.sum(preds == labels)
            num_samples += batch_count

    if num_samples == 0:
        raise ValueError(f"dataloaders[{phase!r}] produced no samples to evaluate")

    loss = running_loss / num_samples
    acc = running_corrects.double() / num_samples

    return loss, acc

## Grid search for hyperparameters

In [18]:
# Search for the best parameters
# Accumulates one tuple per trained configuration:
# (lr, epoch, l2, final_loss, final_acc, test_loss, test_acc, best_acc)
results = []

def search_param(lrs, epochs, l2_s):
    """Grid-search learning rates, epoch counts and weight-decay strengths.

    For every combination a fresh ImageNet-pretrained ResNet-18 is created, its
    ``fc`` layer is replaced to output ``num_classes`` logits, and it is trained
    with ``train_fine_tuning_search`` and then scored on the 'test' split with
    ``evaluate_model``. One tuple per combination is appended to the
    module-level ``results`` list.

    Note: the test metrics are recorded for every configuration; choose
    hyperparameters from the validation accuracy so the test split stays unbiased.

    Args:
        lrs (iterable): Pairs ``(lr_small, lr_large)``; the first is the base
            learning rate, the second the learning rate for the ``fc`` layer.
        epochs (iterable): Epoch counts to try.
        l2_s (iterable): Weight-decay strengths to try.
    """
    for lr in lrs:
        for epoch in epochs:
            for l2 in l2_s:
                print(f'----Training with lr={lr}, epochs={epoch}, l2={l2}----')
                # Start every configuration from the same pretrained weights
                net = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
                net.fc = nn.Linear(net.fc.in_features, num_classes)
                net = net.to(device)

                trained_net, final_loss, final_acc, best_acc = train_fine_tuning_search(
                    net, dataloaders, dataset_sizes, lr[0], lr[1], epoch, l2)

                test_loss, test_acc = evaluate_model(trained_net, dataloaders, nn.CrossEntropyLoss())

                results.append((lr, epoch, l2, final_loss, final_acc, test_loss, test_acc, best_acc))
                print(f'Test loss: {test_loss:.4f} Acc: {test_acc:.4f}\n')

In [19]:
# Search for lrs
# Each pair is (small, large): `small` is the base learning rate and `large`
# is the learning rate applied to the `fc` layer's parameter group.
lrs = [(0.01, 0.1), (0.01, 0.01), (0.001, 0.01), (0.001, 0.001), (5e-4, 5e-3), 
       (1e-4, 1e-3), (1e-5, 1e-4), (5e-5, 5e-4)]
# Epoch count held fixed while the learning rates vary
epoch = 15

# Weight decay is held fixed at 0.01 for this sweep
search_param(lrs, [epoch], [0.01])

----Training with lr=(0.01, 0.1), epochs=15, l2=0.01----
Final train Loss: 2.1697 Acc: 0.5078
Final val Loss: 2.4523 Acc: 0.4178
Best val Acc: 0.4904
Test loss: 1.9993 Acc: 0.5074

----Training with lr=(0.01, 0.01), epochs=15, l2=0.01----
Final train Loss: 1.9356 Acc: 0.5969
Final val Loss: 2.1083 Acc: 0.4879
Best val Acc: 0.5596
Test loss: 1.7709 Acc: 0.5690

----Training with lr=(0.001, 0.01), epochs=15, l2=0.01----
Final train Loss: 1.2869 Acc: 0.7358
Final val Loss: 1.2314 Acc: 0.7048
Best val Acc: 0.7106
Test loss: 1.1907 Acc: 0.7164

----Training with lr=(0.001, 0.001), epochs=15, l2=0.01----
Final train Loss: 2.3232 Acc: 0.5994
Final val Loss: 2.0525 Acc: 0.5997
Best val Acc: 0.5997
Test loss: 1.9915 Acc: 0.6134

----Training with lr=(0.0005, 0.005), epochs=15, l2=0.01----
Final train Loss: 1.5252 Acc: 0.6970
Final val Loss: 1.4174 Acc: 0.6747
Best val Acc: 0.6747
Test loss: 1.3381 Acc: 0.6964

----Training with lr=(0.0001, 0.001), epochs=15, l2=0.01----
Final train Loss: 3.1223

In [23]:
# Search for epochs
# Learning-rate pair (base, fc layer) held fixed while the epoch count varies
lr = (0.001, 0.01)
epochs = [15, 25, 45, 65]
# NOTE: rebinds the module-level `l2` that the fine-tuning cell later passes to train_fine_tuning
l2 = 0.01
# Every epoch count trains a freshly created pretrained network
search_param([lr], epochs, [l2])

----Training with lr=(0.001, 0.01), epochs=15, l2=0.01----
Final train Loss: 1.2756 Acc: 0.7426
Final val Loss: 1.2268 Acc: 0.7056
Best val Acc: 0.7056
Test loss: 1.1648 Acc: 0.7194

----Training with lr=(0.001, 0.01), epochs=25, l2=0.01----
Final train Loss: 1.0975 Acc: 0.8017
Final val Loss: 1.1870 Acc: 0.7248
Best val Acc: 0.7248
Test loss: 1.1144 Acc: 0.7347

----Training with lr=(0.001, 0.01), epochs=45, l2=0.01----
Final train Loss: 0.9513 Acc: 0.8367
Final val Loss: 1.1712 Acc: 0.7206
Best val Acc: 0.7381
Test loss: 1.1074 Acc: 0.7453

----Training with lr=(0.001, 0.01), epochs=65, l2=0.01----
Final train Loss: 0.8487 Acc: 0.8659
Final val Loss: 1.1642 Acc: 0.7323
Best val Acc: 0.7456
Test loss: 1.0980 Acc: 0.7503



In [21]:
# Search for l2 regularization strengths
# Candidate weight-decay values passed to SGD
l2_s = [0.1, 0.01, 0.001, 0.0001]
# Epoch count and learning-rate pair (base, fc layer) are held fixed for this sweep
epoch = 25
lr = (0.001, 0.01)
search_param([lr], [epoch], l2_s)

----Training with lr=(0.001, 0.01), epochs=25, l2=0.1----
Final train Loss: 4.4820 Acc: 0.1007
Final val Loss: 4.3680 Acc: 0.1243
Best val Acc: 0.3770
Test loss: 3.2471 Acc: 0.3907

----Training with lr=(0.001, 0.01), epochs=25, l2=0.01----
Final train Loss: 1.1049 Acc: 0.7931
Final val Loss: 1.1633 Acc: 0.7189
Best val Acc: 0.7256
Test loss: 1.1167 Acc: 0.7301

----Training with lr=(0.001, 0.01), epochs=25, l2=0.001----
Final train Loss: 0.6861 Acc: 0.8482
Final val Loss: 1.0361 Acc: 0.7198
Best val Acc: 0.7248
Test loss: 0.9837 Acc: 0.7275

----Training with lr=(0.001, 0.01), epochs=25, l2=0.0001----
Final train Loss: 0.6500 Acc: 0.8515
Final val Loss: 1.0420 Acc: 0.7198
Best val Acc: 0.7206
Test loss: 0.9797 Acc: 0.7287



## Fine-tuning the model

In [56]:
# Load a pre-trained ResNet-18 model
model_ft = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
# Modify the fully connected layer to match the number of output classes
model_ft.fc = nn.Linear(model_ft.fc.in_features, num_classes)
# Initialize the weights of FC layer using Xavier uniform initialization
# (only the weight is re-initialized; the bias keeps nn.Linear's default init)
nn.init.xavier_uniform_(model_ft.fc.weight)
model_ft = model_ft.to(device)

# Train the model
# NOTE: rebinds the module-level `learning_rate` and `num_epochs`; `l2` and
# `patience` are taken from whatever earlier cells last assigned them.
learning_rate = 0.001
num_epochs = 50
# With the default param_group=True the `fc` layer trains at learning_rate * 10.
model_ft = train_fine_tuning(model_ft, dataloaders, dataset_sizes, learning_rate, num_epochs, l2, patience)

Epoch 0/49
----------
train Loss: 5.5233 Acc: 0.0156
val Loss: 4.7857 Acc: 0.0601
Epoch 1/49
----------
train Loss: 4.3770 Acc: 0.1255
val Loss: 3.7077 Acc: 0.1977
Epoch 2/49
----------
train Loss: 3.5087 Acc: 0.2609
val Loss: 2.9357 Acc: 0.3403
Epoch 3/49
----------
train Loss: 2.8977 Acc: 0.3973
val Loss: 2.4918 Acc: 0.4170
Epoch 4/49
----------
train Loss: 2.4901 Acc: 0.4792
val Loss: 2.2088 Acc: 0.5071
Epoch 5/49
----------
train Loss: 2.2320 Acc: 0.5437
val Loss: 2.0041 Acc: 0.5388
Epoch 6/49
----------
train Loss: 2.0737 Acc: 0.5704
val Loss: 1.8437 Acc: 0.5680
Epoch 7/49
----------
train Loss: 1.9085 Acc: 0.6148
val Loss: 1.7391 Acc: 0.5955
Epoch 8/49
----------
train Loss: 1.8279 Acc: 0.6265
val Loss: 1.6664 Acc: 0.6155
Epoch 9/49
----------
train Loss: 1.7025 Acc: 0.6542
val Loss: 1.6126 Acc: 0.6163
Epoch 10/49
----------
train Loss: 1.6307 Acc: 0.6753
val Loss: 1.5453 Acc: 0.6480
Epoch 11/49
----------
train Loss: 1.5866 Acc: 0.6792
val Loss: 1.4966 Acc: 0.6497
Epoch 12/49
--

## Training from scratch

In [10]:
# Train a model from scratch
model_scratch = models.resnet18()  # no pre-trained weights used
# Replace the classifier head with a 200-output layer
model_scratch.fc = nn.Linear(model_scratch.fc.in_features, 200)
model_scratch = model_scratch.to(device)

# Single learning rate for all parameters; train_model_scratch attaches a
# StepLR schedule (x0.1 every 25 epochs) to this optimizer.
optimizer = optim.SGD(model_scratch.parameters(), lr=0.01, momentum=0.9, weight_decay=0.001)


# NOTE: rebinds the module-level `num_epochs`
num_epochs = 100
model_scratch = train_model_scratch(model_scratch, dataloaders, dataset_sizes, optimizer, num_epochs)

Epoch 0/99
----------
train Loss: 5.3608 Acc: 0.0083
val Loss: 5.1666 Acc: 0.0200
Epoch 1/99
----------
train Loss: 5.0445 Acc: 0.0223
val Loss: 4.8936 Acc: 0.0267
Epoch 2/99
----------
train Loss: 4.7982 Acc: 0.0321
val Loss: 4.9673 Acc: 0.0267
Epoch 3/99
----------
train Loss: 4.6457 Acc: 0.0400
val Loss: 4.5881 Acc: 0.0475
Epoch 4/99
----------
train Loss: 4.5421 Acc: 0.0509
val Loss: 4.6949 Acc: 0.0475
Epoch 5/99
----------
train Loss: 4.4472 Acc: 0.0603
val Loss: 4.5775 Acc: 0.0550
Epoch 6/99
----------
train Loss: 4.3055 Acc: 0.0713
val Loss: 4.6730 Acc: 0.0559
Epoch 7/99
----------
train Loss: 4.2129 Acc: 0.0884
val Loss: 4.2846 Acc: 0.0751
Epoch 8/99
----------
train Loss: 4.1206 Acc: 0.0966
val Loss: 4.3907 Acc: 0.0801
Epoch 9/99
----------
train Loss: 4.0465 Acc: 0.1122
val Loss: 4.2490 Acc: 0.0801
Epoch 10/99
----------
train Loss: 3.9810 Acc: 0.1164
val Loss: 4.4095 Acc: 0.0826
Epoch 11/99
----------
train Loss: 3.8921 Acc: 0.1226
val Loss: 4.4009 Acc: 0.0934
Epoch 12/99
--

## Evaluation

In [11]:
# Load weights
# Rebuild the fine-tuned architecture: a ResNet-18 whose `fc` layer has
# `num_classes` outputs, matching the layer the checkpoint was saved from.
model_ft_load = models.resnet18()
num_ftrs = model_ft_load.fc.in_features
model_ft_load.fc = nn.Linear(num_ftrs, num_classes)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft_load = model_ft_load.to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
model_ft_load.load_state_dict(torch.load('CUB_best_weights_ft.pth', map_location=device))

<All keys matched successfully>

In [12]:
# Score both models on the 'test' split with the same loss function.
criterion = nn.CrossEntropyLoss()
# Fine-tuned network, restored from the saved checkpoint
test_loss_ft, test_acc_ft = evaluate_model(
    model_ft_load, dataloaders, criterion, phase='test')
# From-scratch network, as returned by the training cell (still in memory)
test_loss_scratch, test_acc_scratch = evaluate_model(
    model_scratch, dataloaders, criterion, phase='test')
print(
    f'Pre-trained Model - Test Loss: {test_loss_ft:.4f} Test Acc: {test_acc_ft:.4f}')
print(
    f'Scratch Model - Test Loss: {test_loss_scratch:.4f} Test Acc: {test_acc_scratch:.4f}')

Pre-trained Model - Test Loss: 1.1380 Test Acc: 0.7325
Scratch Model - Test Loss: 2.8258 Test Acc: 0.3243
