In [None]:
# Mount Google Drive at /content/drive. The checkpoint paths used later in this
# notebook live under this mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from torchvision.models import resnet18, ResNet18_Weights
import random
import copy
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import torch
import torch.optim as optim
import torchvision
from torchvision.transforms import Resize
import torchvision.transforms as transforms
import torchvision.models as models
import torch.quantization
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import pandas as pd
import time


# Select the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice (same two messages as before).
if device.type == "cuda":
    print("GPU is available and being used.")
else:
    print("GPU is not available, using CPU instead.")
print(f"Using device: {device}")

GPU is available and being used.
Using device: cuda


In [None]:
# Hyperparameters
# ---------------
# Length of the run (number of passes over the training set).
num_epochs = 11

# Optimizer settings.
learning_rate = 1e-3
weight_decay = 1e-3
momentum = 0.9  # NOTE(review): the visible optimizers are Adam and do not read this value

# Cosine-annealing schedule: anneal over the whole run down to eta_min.
T_max = num_epochs
eta_min = 1e-5


In [None]:
# CIFAR-100 input pipelines.
# Every image is resized to 224x224 and normalized per channel with the
# mean/std values below; only the training pipeline adds random augmentation.
image_size = (224, 224)
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

transform_train = transforms.Compose([
    transforms.Resize(image_size),
    # Augmentation: random flip, colour jitter and a rotation of up to 15 degrees.
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    normalize,
])

transform_test = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
    normalize,
])

# Download (if needed) the CIFAR-100 train and test splits into ./data.
cifar_kwargs = dict(root='./data', download=True)
download_train_dataset = torchvision.datasets.CIFAR100(train=True, transform=transform_train, **cifar_kwargs)
download_test_dataset = torchvision.datasets.CIFAR100(train=False, transform=transform_test, **cifar_kwargs)

# Batched loaders: the training split is reshuffled every epoch, the test split is not.
batch_size = 64
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(download_train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(download_test_dataset, shuffle=False, **loader_kwargs)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:13<00:00, 13.0MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


### Basic block and ResNet18 architecture

In [None]:
class BasicBlock(nn.Module):
    """Residual block made of two conv -> batch-norm stages plus a shortcut.

    The block computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``
    where ``shortcut`` is ``downsample`` when one is given and the identity
    otherwise.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by both convolutions.
        kernel_size: Kernel size shared by both convolutions.
        stride: Stride of the first convolution (the second always uses 1).
        padding: Padding shared by both convolutions.
        downsample: Optional module applied to the input before the addition;
            the caller must supply one whenever the main path changes the
            tensor shape, otherwise the addition fails.
    """

    expansion = 1  # Output width multiplier; this block does not widen its output

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, downsample=None):
        super().__init__()
        self.stride = stride

        # Settings common to both convolutions; only the stride differs.
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding, bias=False)

        # Stage 1: the only place where the stride is applied.
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        # Stage 2: keeps the spatial size produced by stage 1.
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Optional projection for the shortcut branch.
        self.downsample = downsample

    def forward(self, x):
        """Run the block on ``x`` and return the activated residual sum."""
        # Main path.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut path: project the input only when a projection was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

In [None]:
class ResNet18(nn.Module):
    """ResNet-18 style classifier assembled from ``BasicBlock`` modules.

    Layout: a 7x7 stride-2 convolution with batch norm, ReLU and a stride-2
    max pool, followed by four stages of two ``BasicBlock``s each
    (64, 128, 256 and 512 channels; stages 2-4 halve the resolution),
    global average pooling and a linear classifier.

    Args:
        num_classes: Size of the output layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Stem: convolution + max pool, each with stride 2.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages, two blocks each.
        self.layer1 = self._make_layer(64, 64, blocks=2, stride=1)
        self.layer2 = self._make_layer(64, 128, blocks=2, stride=2)
        self.layer3 = self._make_layer(128, 256, blocks=2, stride=2)
        self.layer4 = self._make_layer(256, 512, blocks=2, stride=2)

        # Head: collapse the spatial dimensions, then classify.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * BasicBlock.expansion, num_classes)

        self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block may change the stride or channel count; when it
        does, it receives a 1x1 convolution + batch norm projection so the
        shortcut matches the main path. The remaining blocks map
        ``out_channels`` to ``out_channels`` with stride 1.
        """
        needs_projection = stride != 1 or in_channels != out_channels
        projection = None
        if needs_projection:
            projection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        stage = [BasicBlock(in_channels, out_channels, stride=stride, downsample=projection)]
        stage.extend(BasicBlock(out_channels, out_channels) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = torch.flatten(self.avgpool(x), 1)
        return self.fc(pooled)

    def _initialize_weights(self):
        """Re-initialize convolution and batch-norm parameters.

        Convolutions get Kaiming-normal weights (fan-out mode, ReLU gain);
        batch-norm layers get weight 1 and bias 0. Other modules, including
        the linear classifier, keep the initialization they were built with.
        """
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

### Functions

In [None]:
def load_checkpoint(model, optimizer, path):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'epoch'``. Tensors are mapped to the
    module-level ``device`` while loading.

    Args:
        model: Module whose parameters are overwritten in place.
        optimizer: Optimizer whose state is overwritten in place.
        path: Location of the checkpoint file.

    Returns:
        Tuple ``(model, optimizer, epoch)`` where ``epoch`` is the value
        stored under ``'epoch'`` in the checkpoint.
    """
    ckpt = torch.load(path, map_location=device)

    model.load_state_dict(ckpt['model_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    model.to(device)

    # The optimizer state was loaded before the model moved to `device`,
    # so move every tensor in that state to `device` as well.
    for param_state in optimizer.state.values():
        tensor_keys = [key for key, value in param_state.items() if isinstance(value, torch.Tensor)]
        for key in tensor_keys:
            param_state[key] = param_state[key].to(device)

    return model, optimizer, ckpt['epoch']


In [None]:
def save_checkpoint(model, optimizer, epoch, path):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``
    and ``'optimizer_state_dict'``. Missing parent directories are created.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch counter stored verbatim under ``'epoch'``.
        path: Destination file. A bare file name (no directory part) is
            written to the current working directory.
    """
    import os

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)

In [None]:
def evaluate(model, data_loader, device):
    """Compute top-1 accuracy of ``model`` over ``data_loader``.

    The model is switched to evaluation mode and left in that mode; callers
    that continue training must call ``model.train()`` again.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a percentage in ``[0, 100]``. An empty ``data_loader``
        yields ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():  # No gradients are needed for evaluation
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            # Index of the highest score along the class dimension.
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0
    return 100 * correct / total

### Option 1: Load the pretrained checkpoint weights into our model

In [None]:
# Build the 100-class network and print, for every trainable parameter,
# its name, shape and element sum.
model = ResNet18(num_classes=100)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape, param.data.sum())

conv1.weight torch.Size([64, 3, 7, 7]) tensor(2.8040)
bn1.weight torch.Size([64]) tensor(64.)
bn1.bias torch.Size([64]) tensor(0.)
layer1.0.conv1.weight torch.Size([64, 64, 3, 3]) tensor(10.2015)
layer1.0.bn1.weight torch.Size([64]) tensor(64.)
layer1.0.bn1.bias torch.Size([64]) tensor(0.)
layer1.0.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-1.0695)
layer1.0.bn2.weight torch.Size([64]) tensor(64.)
layer1.0.bn2.bias torch.Size([64]) tensor(0.)
layer1.1.conv1.weight torch.Size([64, 64, 3, 3]) tensor(-7.3693)
layer1.1.bn1.weight torch.Size([64]) tensor(64.)
layer1.1.bn1.bias torch.Size([64]) tensor(0.)
layer1.1.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-6.4864)
layer1.1.bn2.weight torch.Size([64]) tensor(64.)
layer1.1.bn2.bias torch.Size([64]) tensor(0.)
layer2.0.conv1.weight torch.Size([128, 64, 3, 3]) tensor(-21.4533)
layer2.0.bn1.weight torch.Size([128]) tensor(128.)
layer2.0.bn1.bias torch.Size([128]) tensor(0.)
layer2.0.conv2.weight torch.Size([128, 128, 3, 3]) tensor(16.451

In [None]:
# Warm-start `model` from the pretrained checkpoint stored on Drive.
path = '/content/drive/My Drive/Colab Notebooks/checkpoints/pretrain_resnet18.pth'
checkpoint = torch.load(path, map_location=device)
pretrained_dict = checkpoint['model_state_dict']  # weights saved in the checkpoint
model_dict = model.state_dict()                   # weights currently in the model

# Keep only checkpoint entries whose name exists in the model with the same shape.
filtered_dict = {
    k: v
    for k, v in pretrained_dict.items()
    if k in model_dict and model_dict[k].size() == v.size()
}

# Overlay the matching entries on the model's own state and load the result.
model_dict.update(filtered_dict)
model.load_state_dict(model_dict)

# Training starts from the first epoch.
start_epoch = 0
model.to(device)
model.eval()

# Loss, optimizer and cosine learning-rate schedule.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=T_max, eta_min=eta_min
)

# Print the start epoch and a per-parameter summary to confirm the weights changed.
print("start epoch: ", start_epoch)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape, param.data.sum())

  checkpoint = torch.load(path, map_location=device)


start epoch:  0
conv1.weight torch.Size([64, 3, 7, 7]) tensor(-0.0831, device='cuda:0')
bn1.weight torch.Size([64]) tensor(15.1651, device='cuda:0')
bn1.bias torch.Size([64]) tensor(10.5746, device='cuda:0')
layer1.0.conv1.weight torch.Size([64, 64, 3, 3]) tensor(-252.3925, device='cuda:0')
layer1.0.bn1.weight torch.Size([64]) tensor(20.1336, device='cuda:0')
layer1.0.bn1.bias torch.Size([64]) tensor(-1.2524, device='cuda:0')
layer1.0.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-79.8547, device='cuda:0')
layer1.0.bn2.weight torch.Size([64]) tensor(18.7809, device='cuda:0')
layer1.0.bn2.bias torch.Size([64]) tensor(-1.7764, device='cuda:0')
layer1.1.conv1.weight torch.Size([64, 64, 3, 3]) tensor(-170.1751, device='cuda:0')
layer1.1.bn1.weight torch.Size([64]) tensor(19.5954, device='cuda:0')
layer1.1.bn1.bias torch.Size([64]) tensor(-5.7394, device='cuda:0')
layer1.1.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-129.5225, device='cuda:0')
layer1.1.bn2.weight torch.Size([64]) tensor

In [None]:
# Checkpoint entries that were dropped by the name/shape filter.
unused_keys = [key for key in pretrained_dict.keys() if key not in filtered_dict]
print("Unused keys from the checkpoint:", unused_keys)

# Model entries that received no value from the checkpoint.
uninitialized_keys = [key for key in model_dict.keys() if key not in filtered_dict]
print("Keys in the new model not initialized from checkpoint:", uninitialized_keys)

Unused keys from the checkpoint: []
Keys in the new model not initialized from checkpoint: []


### Option 2: Keep training with our own model checkpoint

In [None]:
# Resume training from the checkpoint file that the training loop writes.
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/checkpoints/transfer_learning_checkpoint.pth'

# Create an empty model and optimizer; load_checkpoint fills in their saved state.
model = ResNet18(num_classes=100)
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)
model, optimizer, start_epoch = load_checkpoint(model, optimizer, checkpoint_path)
model.to(device)

# Loss function.
criterion = nn.CrossEntropyLoss()

# Learning-rate scheduler.
# NOTE(review): the checkpoint stores no scheduler state, so this schedule is created fresh.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=T_max, eta_min=eta_min
)

# Optional partial fine-tuning (disabled): freeze everything, then unfreeze
# layer3, layer4 and the classifier.
# for param in model.parameters():
#     param.requires_grad = False
# for param in model.layer3.parameters():
#     param.requires_grad = True
# for param in model.layer4.parameters():
#     param.requires_grad = True
# for param in model.fc.parameters():
#     param.requires_grad = True

# Print the restored epoch and a per-parameter summary to verify the load.
print("start epoch: ", start_epoch)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape, param.data.sum())

  checkpoint = torch.load(path, map_location=device)


start epoch:  4
conv1.weight torch.Size([64, 3, 7, 7]) tensor(5.1260, device='cuda:0')
bn1.weight torch.Size([64]) tensor(12.0971, device='cuda:0')
bn1.bias torch.Size([64]) tensor(6.9236, device='cuda:0')
layer1.0.conv1.weight torch.Size([64, 64, 3, 3]) tensor(-83.6823, device='cuda:0')
layer1.0.bn1.weight torch.Size([64]) tensor(15.5185, device='cuda:0')
layer1.0.bn1.bias torch.Size([64]) tensor(-2.6118, device='cuda:0')
layer1.0.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-11.2146, device='cuda:0')
layer1.0.bn2.weight torch.Size([64]) tensor(14.6549, device='cuda:0')
layer1.0.bn2.bias torch.Size([64]) tensor(-3.4231, device='cuda:0')
layer1.1.conv1.weight torch.Size([64, 64, 3, 3]) tensor(-41.3265, device='cuda:0')
layer1.1.bn1.weight torch.Size([64]) tensor(15.5958, device='cuda:0')
layer1.1.bn1.bias torch.Size([64]) tensor(-5.0180, device='cuda:0')
layer1.1.conv2.weight torch.Size([64, 64, 3, 3]) tensor(-53.1163, device='cuda:0')
layer1.1.bn2.weight torch.Size([64]) tensor(19.4

In [None]:
# Per-epoch statistics, indexed by epoch number. Entries for epochs that are
# skipped (when resuming with start_epoch > 0) stay at zero.
total_train = torch.zeros(num_epochs)     # samples seen in the epoch
correct_train = torch.zeros(num_epochs)   # correctly classified samples in the epoch
avg_loss_train = torch.zeros(num_epochs)  # mean of the per-batch losses
accuracy_train = torch.zeros(num_epochs)  # 100 * correct / total for the epoch

# TRAINING LOOP
print("START TRAINING........")
train_losses = []      # one list of per-batch losses per epoch
train_accuracies = []  # one list of per-batch accuracies (%) per epoch
test_accuracies = []   # accuracy on test_loader after each epoch

# Loop-invariant: every periodic save overwrites the same file.
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/checkpoints/transfer_learning_checkpoint.pth'

for epoch in range(start_epoch, num_epochs):
    model.train()  # evaluate() leaves the model in eval mode, so reset every epoch
    batch_losses = []
    batch_accuracies = []

    # `inputs` rather than `input` so the Python builtin is not shadowed.
    for inputs, target in train_loader:
        inputs, target = inputs.to(device), target.to(device)

        # forward
        output = model(inputs)
        loss = criterion(output, target)

        # backward
        optimizer.zero_grad()
        loss.backward()
        # Clip between backward() and step(): clipping after the step would
        # rescale gradients that have already been applied.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # per-batch bookkeeping
        batch_losses.append(loss.item())
        _, predicted = output.max(1)
        total = target.size(0)
        correct = predicted.eq(target).sum().item()
        batch_accuracies.append(100. * correct / total)
        total_train[epoch] += total
        correct_train[epoch] += correct

    train_losses.append(batch_losses)
    train_accuracies.append(batch_accuracies)
    avg_loss_train[epoch] = np.mean(batch_losses)
    # Sample-weighted accuracy: averaging per-batch percentages would give a
    # smaller final batch the same weight as a full one.
    accuracy_train[epoch] = 100. * correct_train[epoch] / total_train[epoch]

    # Evaluate on test_loader after each epoch (reported as "Validation Accuracy").
    test_accuracy = evaluate(model, test_loader, device)
    test_accuracies.append(test_accuracy)

    if (epoch + 1) % 2 == 1:
        # Store the index of the NEXT epoch to run. The resume cell uses the
        # stored value directly as start_epoch, so storing `epoch` would make
        # a resumed run repeat the epoch that just finished.
        save_checkpoint(model, optimizer, epoch + 1, checkpoint_path)
    print(f"Epoch [{epoch+1}/{num_epochs}] - "
          f"Train Loss: {avg_loss_train[epoch]:.4f} - "
          f"Train Accuracy: {accuracy_train[epoch]:.2f}% - "
          f"Validation Accuracy: {test_accuracy:.2f}% "
          )

    scheduler.step()


START TRAINING........
Epoch [5/11] - Train Loss: 0.4893 - Train Accuracy: 86.29% - Validation Accuracy: 76.15% 
Epoch [6/11] - Train Loss: 0.4291 - Train Accuracy: 88.26% - Validation Accuracy: 76.33% 
Epoch [7/11] - Train Loss: 0.3749 - Train Accuracy: 89.93% - Validation Accuracy: 76.52% 
Epoch [8/11] - Train Loss: 0.3272 - Train Accuracy: 91.53% - Validation Accuracy: 76.94% 
Epoch [9/11] - Train Loss: 0.2703 - Train Accuracy: 93.32% - Validation Accuracy: 77.22% 
Epoch [10/11] - Train Loss: 0.2273 - Train Accuracy: 94.77% - Validation Accuracy: 77.38% 
Epoch [11/11] - Train Loss: 0.1875 - Train Accuracy: 95.89% - Validation Accuracy: 77.45% 
