In [None]:
from torchvision.models import resnet18, ResNet18_Weights
import random
import copy
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import torch
import torch.optim as optim
import torchvision
from torchvision.transforms import Resize
import torchvision.transforms as transforms
import torchvision.models as models
import torch.quantization
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import pandas as pd
import time


# Select the compute device once: CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the outcome in the same two-message form as before.
if device.type == "cuda":
    print("GPU is available and being used.")
else:
    print("GPU is not available, using CPU instead.")

print(f"Using device: {device}")

GPU is available and being used.
Using device: cuda


In [None]:
# Hyperparameters

# -- Optimizer settings --
learning_rate = 5e-4   # passed as `lr` to the Adam optimizers created below
momentum = 0.9         # NOTE(review): not referenced by the Adam optimizers in the visible cells
weight_decay = 1e-3    # passed as `weight_decay` to the Adam optimizers created below

# -- Training length and cosine-annealing schedule --
num_epochs = 61
T_max = num_epochs     # passed as `T_max` to CosineAnnealingLR
eta_min = 1e-5         # passed as `eta_min` to CosineAnnealingLR


### Basic block and ResNet18 architecture

In [None]:
class BasicBlock(nn.Module):
    """Two-convolution residual block with an optional projection shortcut.

    The main path is conv -> batch norm -> ReLU -> conv -> batch norm. The
    shortcut (the input itself, or ``downsample(input)`` when a ``downsample``
    module is supplied) is added to the main path and a final ReLU is applied.

    Args:
        in_channels: Channel count of the input tensor.
        out_channels: Channel count produced by both convolutions.
        kernel_size: Kernel size used by both convolutions.
        stride: Stride of the first convolution; the second always uses 1.
        padding: Padding used by both convolutions.
        downsample: Optional module applied to the input before the addition;
            the caller must make its output shape match the main path.
    """

    # Output channels equal ``out_channels * expansion``; this block does not expand.
    expansion = 1

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, downsample=None):
        super().__init__()
        self.stride = stride

        # Arguments common to both convolutions; only the stride differs.
        conv_kwargs = dict(kernel_size=kernel_size, padding=padding, bias=False)

        # Stage 1: (possibly strided) convolution, normalisation, activation.
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        # Stage 2: stride-1 convolution and normalisation.
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Optional module that reshapes the shortcut to match the main path.
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # Main path: conv1 -> bn1 -> relu -> conv2 -> bn2.
        residual = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))

        # Shortcut: the raw input unless a projection module was provided.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)

In [None]:
class ResNet18(nn.Module):
    """ResNet-18 style classifier assembled from ``BasicBlock`` stages.

    Pipeline: 7x7 stride-2 convolution -> batch norm -> ReLU -> 3x3 stride-2
    max pool -> four stages of two ``BasicBlock``s each (64, 128, 256 and 512
    channels) -> adaptive average pool to 1x1 -> flatten -> linear layer.

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Stem: convolution, normalisation, activation, pooling.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages of two blocks each; stages 2-4 use stride 2.
        self.layer1 = self._make_layer(64, 64, blocks=2, stride=1)
        self.layer2 = self._make_layer(64, 128, blocks=2, stride=2)
        self.layer3 = self._make_layer(128, 256, blocks=2, stride=2)
        self.layer4 = self._make_layer(256, 512, blocks=2, stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * BasicBlock.expansion, num_classes)

        self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage as an ``nn.Sequential`` of ``blocks`` BasicBlocks.

        Only the first block receives ``stride`` and the channel change. It gets
        a 1x1 convolution + batch norm shortcut whenever the stride is not 1 or
        the channel counts differ. The remaining blocks map ``out_channels`` to
        ``out_channels`` with default settings.
        """
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            shortcut = None

        first_block = BasicBlock(in_channels, out_channels, stride=stride, downsample=shortcut)
        remaining = [BasicBlock(out_channels, out_channels) for _ in range(blocks - 1)]
        return nn.Sequential(first_block, *remaining)

    def forward(self, x):
        """Run the stem, the four stages and the head; return the logits."""
        features = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)

        pooled = torch.flatten(self.avgpool(features), 1)
        return self.fc(pooled)

    def _initialize_weights(self):
        """Re-initialise every Conv2d and BatchNorm2d submodule.

        Convolutions get Kaiming-normal weights (fan_out, ReLU); batch-norm
        layers get weight 1 and bias 0. Other modules (e.g. ``fc``) are not
        touched here.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

### Retrain from the pretrained model if needed

In [None]:
model = ResNet18(num_classes=100)  # 100 output classes for CIFAR-100

# Load the ImageNet-pretrained torchvision ResNet-18 weights. The `weights=` enum
# replaces the deprecated `pretrained=True` flag and selects the same
# IMAGENET1K_V1 checkpoint.
pretrained_dict = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1).state_dict()
model_dict = model.state_dict()

# Keep only entries whose name AND shape match the custom model. This drops the
# 1000-class `fc` head, which cannot be copied into the 100-class head.
pretrained_dict = {
    k: v
    for k, v in pretrained_dict.items()
    if k in model_dict and model_dict[k].size() == v.size()
}

# Overlay the matching pretrained tensors on the model's own state dict, then
# load the merged dict so unmatched entries keep their fresh initialisation.
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)

# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last residual stage and the fully connected layer for fine-tuning
for param in model.layer4.parameters():
    param.requires_grad = True
for param in model.fc.parameters():
    param.requires_grad = True

# Verify which layers are frozen and which are not (optional, for verification)
for name, param in model.named_parameters():
    print(f"{name} is {'unfrozen' if param.requires_grad else 'frozen'}")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 176MB/s]


conv1.weight is frozen
bn1.weight is frozen
bn1.bias is frozen
layer1.0.conv1.weight is frozen
layer1.0.bn1.weight is frozen
layer1.0.bn1.bias is frozen
layer1.0.conv2.weight is frozen
layer1.0.bn2.weight is frozen
layer1.0.bn2.bias is frozen
layer1.1.conv1.weight is frozen
layer1.1.bn1.weight is frozen
layer1.1.bn1.bias is frozen
layer1.1.conv2.weight is frozen
layer1.1.bn2.weight is frozen
layer1.1.bn2.bias is frozen
layer2.0.conv1.weight is frozen
layer2.0.bn1.weight is frozen
layer2.0.bn1.bias is frozen
layer2.0.conv2.weight is frozen
layer2.0.bn2.weight is frozen
layer2.0.bn2.bias is frozen
layer2.0.downsample.0.weight is frozen
layer2.0.downsample.1.weight is frozen
layer2.0.downsample.1.bias is frozen
layer2.1.conv1.weight is frozen
layer2.1.bn1.weight is frozen
layer2.1.bn1.bias is frozen
layer2.1.conv2.weight is frozen
layer2.1.bn2.weight is frozen
layer2.1.bn2.bias is frozen
layer3.0.conv1.weight is frozen
layer3.0.bn1.weight is frozen
layer3.0.bn1.bias is frozen
layer3.0.con

In [None]:
### Retrain if needed
# Place the model on the selected device. Eval mode is only the initial state
# here; the training loop switches the model to train mode at every epoch.
model.to(device).eval()

# Training starts from the first epoch on this path.
start_epoch = 0

# Loss, optimizer and cosine-annealing learning-rate schedule.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)

# Sanity check: list every trainable parameter with its shape and value sum.
print("start epoch: ", start_epoch)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape, param.data.sum())

start epoch:  0
layer4.0.conv1.weight torch.Size([512, 256, 3, 3]) tensor(-1845.7561, device='cuda:0')
layer4.0.bn1.weight torch.Size([512]) tensor(135.3306, device='cuda:0')
layer4.0.bn1.bias torch.Size([512]) tensor(-115.5699, device='cuda:0')
layer4.0.conv2.weight torch.Size([512, 512, 3, 3]) tensor(-3073.9663, device='cuda:0')
layer4.0.bn2.weight torch.Size([512]) tensor(217.2513, device='cuda:0')
layer4.0.bn2.bias torch.Size([512]) tensor(-101.1886, device='cuda:0')
layer4.0.downsample.0.weight torch.Size([512, 256, 1, 1]) tensor(-110.5000, device='cuda:0')
layer4.0.downsample.1.weight torch.Size([512]) tensor(128.3286, device='cuda:0')
layer4.0.downsample.1.bias torch.Size([512]) tensor(-101.1886, device='cuda:0')
layer4.1.conv1.weight torch.Size([512, 512, 3, 3]) tensor(-5334.5947, device='cuda:0')
layer4.1.bn1.weight torch.Size([512]) tensor(147.7632, device='cuda:0')
layer4.1.bn1.bias torch.Size([512]) tensor(-123.7711, device='cuda:0')
layer4.1.conv2.weight torch.Size([512, 5

### Train

In [None]:
# Preprocessing for CIFAR-100.
# Both pipelines resize to 224x224 and apply the same per-channel normalisation;
# only the training pipeline adds random augmentation.
transform_train = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        # Augmentation (training only)
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomRotation(15),
        # Tensor conversion and normalisation
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

transform_test = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Download (if necessary) the CIFAR-100 train and test splits into ./data.
download_train_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform_train
)
download_test_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform_test
)

# Batch loaders: the training split is shuffled, the test split is not.
batch_size = 64
train_loader = DataLoader(
    download_train_dataset, batch_size=batch_size, shuffle=True, num_workers=2
)
test_loader = DataLoader(
    download_test_dataset, batch_size=batch_size, shuffle=False, num_workers=2
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169M/169M [00:18<00:00, 9.00MB/s]


Extracting ./data/cifar-100-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# Mount Google Drive at /content/drive; the checkpoint path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def load_checkpoint(model, optimizer, path):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module that receives ``checkpoint['model_state_dict']``.
        optimizer: Optimizer that receives ``checkpoint['optimizer_state_dict']``.
        path: Location of the checkpoint file, passed to ``torch.load``.

    Returns:
        ``(model, optimizer, epoch)`` where ``epoch`` is the value stored under
        the checkpoint's ``'epoch'`` key.

    Note:
        Reads the notebook-level ``device`` both as ``map_location`` and as the
        destination for the model and the optimizer state.
    """
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    saved = torch.load(path, map_location=device)

    model.load_state_dict(saved['model_state_dict'])
    optimizer.load_state_dict(saved['optimizer_state_dict'])
    model.to(device)

    # Move every tensor held in the optimizer's per-parameter state to `device`.
    for param_state in optimizer.state.values():
        for key in list(param_state):
            value = param_state[key]
            if isinstance(value, torch.Tensor):
                param_state[key] = value.to(device)

    return model, optimizer, saved['epoch']



# Resume training: rebuild the model and optimizer, then restore them from the checkpoint
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/checkpoints/transfer_learning_checkpoint.pth'

# Fresh model and optimizer; their state is overwritten by the checkpoint contents.
model = ResNet18(num_classes=100)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
model, optimizer, start_epoch = load_checkpoint(model, optimizer, checkpoint_path)
model.to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Learning-rate scheduler
# NOTE(review): the scheduler is created fresh here; the checkpoint format used
# in this notebook stores no scheduler state.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)

# Freeze every parameter, then re-enable gradients for layer3, layer4 and fc.
for param in model.parameters():
    param.requires_grad = False
for trainable in (model.layer3, model.layer4, model.fc):
    for param in trainable.parameters():
        param.requires_grad = True

# Sanity check: list every trainable parameter with its shape and value sum.
print("start epoch: ", start_epoch)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data.shape, param.data.sum())

  checkpoint = torch.load(path, map_location=device)


start epoch:  54
layer3.0.conv1.weight torch.Size([256, 128, 3, 3]) tensor(-221.4406, device='cuda:0')
layer3.0.bn1.weight torch.Size([256]) tensor(69.2772, device='cuda:0')
layer3.0.bn1.bias torch.Size([256]) tensor(-27.6587, device='cuda:0')
layer3.0.conv2.weight torch.Size([256, 256, 3, 3]) tensor(-243.0699, device='cuda:0')
layer3.0.bn2.weight torch.Size([256]) tensor(72.6982, device='cuda:0')
layer3.0.bn2.bias torch.Size([256]) tensor(-8.6190, device='cuda:0')
layer3.0.downsample.0.weight torch.Size([256, 128, 1, 1]) tensor(-22.2722, device='cuda:0')
layer3.0.downsample.1.weight torch.Size([256]) tensor(18.9305, device='cuda:0')
layer3.0.downsample.1.bias torch.Size([256]) tensor(-8.6190, device='cuda:0')
layer3.1.conv1.weight torch.Size([256, 256, 3, 3]) tensor(-513.3556, device='cuda:0')
layer3.1.bn1.weight torch.Size([256]) tensor(61.9338, device='cuda:0')
layer3.1.bn1.bias torch.Size([256]) tensor(-55.8490, device='cuda:0')
layer3.1.conv2.weight torch.Size([256, 256, 3, 3]) te

In [None]:
def save_checkpoint(model, optimizer, epoch, path):
    """Write a training checkpoint to ``path``.

    The file is a dict with the keys ``'epoch'``, ``'model_state_dict'`` and
    ``'optimizer_state_dict'``, serialised with ``torch.save``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Value stored under ``'epoch'``.
        path: Destination file. Missing parent directories are created.
    """
    import os  # kept local so the function does not rely on a notebook-level import

    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)

In [None]:
def evaluate(model, data_loader, device):
    """Compute top-1 accuracy (in percent) of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(images)``; its output is reduced with a
            max over dimension 1 to obtain the predicted class per sample.
        data_loader: Iterable yielding ``(images, labels)`` tensor pairs.
        device: Device the images and labels are moved to before the forward pass.

    Returns:
        ``100 * correct / total`` as a float, or ``0.0`` when the loader yields
        no samples (instead of raising ``ZeroDivisionError``).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No gradients are needed for evaluation
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            # Index of the largest output along dim 1 is the predicted class.
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader, which would otherwise divide by zero.
    if total == 0:
        return 0.0

    return 100 * correct / total

In [None]:
# Per-epoch statistics buffers, indexed by epoch number
total_train = torch.zeros(num_epochs)
correct_train = torch.zeros(num_epochs)
avg_loss_train = torch.zeros(num_epochs)
accuracy_train = torch.zeros(num_epochs)

# Checkpoint destination (loop-invariant, so defined once outside the loop)
checkpoint_path = '/content/drive/My Drive/Colab Notebooks/checkpoints/transfer_learning_checkpoint.pth'

# TRAINING LOOP
print("START TRAINING........")
train_losses = []  # per-epoch lists of per-batch training losses
train_accuracies = []  # per-epoch lists of per-batch training accuracies
test_accuracies = []  # accuracy on test_loader after each epoch

for epoch in range(start_epoch, num_epochs):
    model.train()  # Set the model to training mode
    batch_losses = []
    batch_accuracies = []

    # `inputs` rather than `input`, so the builtin input() is not shadowed
    for inputs, target in train_loader:
        inputs, target = inputs.to(device), target.to(device)

        # forward
        output = model(inputs)
        loss = criterion(output, target)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # Clip BEFORE the optimizer step: clipping after the step has no effect,
        # because those gradients were already used and are zeroed next iteration.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        # save data
        batch_losses.append(loss.item())
        _, predicted = output.max(1)
        total = target.size(0)
        correct = predicted.eq(target).sum().item()
        batch_accuracies.append(100. * correct / total)

    train_losses.append(batch_losses)
    train_accuracies.append(batch_accuracies)
    avg_loss_train[epoch] = np.mean(batch_losses)  # mean batch loss for this epoch
    accuracy_train[epoch] = np.mean(batch_accuracies)  # mean batch accuracy for this epoch

    # Validation after each epoch
    test_accuracy = evaluate(model, test_loader, device)
    test_accuracies.append(test_accuracy)

    # Advance the schedule before saving, so the optimizer state in the
    # checkpoint already carries the learning rate for the next epoch.
    scheduler.step()

    # Save after every odd-numbered (1-based) epoch. The stored value is the
    # NEXT epoch to run: the resume cell uses it directly as `start_epoch`, so
    # storing the finished epoch's index would repeat that epoch on resume.
    if (epoch + 1) % 2 == 1:
        save_checkpoint(model, optimizer, epoch + 1, checkpoint_path)

    print(f"Epoch [{epoch+1}/{num_epochs}] - "
          f"Train Loss: {avg_loss_train[epoch]:.4f} - "
          f"Train Accuracy: {accuracy_train[epoch]:.2f}% - "
          f"Validation Accuracy: {test_accuracy:.2f}% "
          )


START TRAINING........
Epoch [55/61] - Train Loss: 0.2630 - Train Accuracy: 93.06% - Validation Accuracy: 74.24% 
Epoch [56/61] - Train Loss: 0.2616 - Train Accuracy: 93.08% - Validation Accuracy: 74.06% 
Epoch [57/61] - Train Loss: 0.2516 - Train Accuracy: 93.44% - Validation Accuracy: 74.42% 
Epoch [58/61] - Train Loss: 0.2499 - Train Accuracy: 93.54% - Validation Accuracy: 74.68% 
Epoch [59/61] - Train Loss: 0.2421 - Train Accuracy: 93.78% - Validation Accuracy: 74.78% 
Epoch [60/61] - Train Loss: 0.2308 - Train Accuracy: 94.10% - Validation Accuracy: 74.28% 
Epoch [61/61] - Train Loss: 0.2338 - Train Accuracy: 93.97% - Validation Accuracy: 74.28% 
