### In this notebook, we call the modified ResNet architecture (with SEBlock added) defined in SE_ResNet_68.py, and then train and test it with our best set of hyperparameters, training strategies and Ranger optimizer, thus get our best model (with the best test accuracy of 96.48% on CIFAR-10).

### In the process of exploring the best model, we experimented on different architectures, hyperparameters and training strategies (the results and plots of experiments are shown in plots_for_augmentation/gradient_clip/lr/optimizer/residual_layers.py). Here we only show the training process and final result of our best model.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.optim import Optimizer
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from pytorch_optimizer import Ranger
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import chain
from torchsummary import summary
from SE_ResNet_55 import ResNet55, BasicBlock
from SE_ResNet_68 import ResNet68, BasicBlock

### Implement the Lookahead optimizer for Ranger and further experiments:

In [3]:
# Reference: https://github.com/Nikunj-Gupta/Efficient_ResNets/blob/master/lookahead.py

'''
PyTorch implement of 'Lookahead Optimizer: k steps forward, 1 step back', arXiv:1907.08610
'''

class Lookahead(Optimizer):
    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0
    
    def update(self, group):
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)
    
    def update_lookahead(self):
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        fast_state_dict = self.optimizer.state_dict()
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

### The model definiton is in SE_ResNet_55.py and SE_ResNet_68.py (with 4 and 3 residual layers respectively), here we call the architecture and BasicBlock to test both architeture and different sets of hyperparameters. (The plots and results of this experiment are in plots_for_residual_layers.py)

### We found the best architure is the one with 3 residual layers, with N = 3, C = 64, Bi = [4, 4, 3], Fi = [3, 3, 3], Ki = [1, 1, 1], P = 8.

In [5]:
# # Define a 55 layers ResNet with 4 residual layers
# net = ResNet55(
#     block=BasicBlock, 
#     num_blocks=[2, 2, 2, 2],               # N: number of Residual Layers | Bi:Residual blocks in Residual Layer i 
#     conv_kernel_sizes=[3, 3, 3, 3],        # Fi: Conv. kernel size in Residual Layer i 
#     shortcut_kernel_sizes=[1, 1, 1, 1] ,   # Ki: Skip connection kernel size in Residual Layer i 
#     num_channels=[64, 128, 232, 268],      # Ci: # channels in Residual Layer i 
#     avg_pool_kernel_size=8,                # P: Average pool kernel size 
#     drop=0,                                # use dropout with drop proportion 
#     squeeze_and_excitation=1               # Enable/disable Squeeze-and-Excitation Block 
#     ) 

# Define a 68 layers ResNet with 3 residual layers
net = ResNet68(
    block=BasicBlock, 
    num_blocks=[4, 4, 3],                    # N: number of Residual Layers | Bi:Residual blocks in Residual Layer i 
    conv_kernel_sizes=[3, 3, 3],             # Fi: Conv. kernel size in Residual Layer i 
    shortcut_kernel_sizes=[1, 1, 1] ,        # Ki: Skip connection kernel size in Residual Layer i 
    num_channels=64,                         # Ci: # channels in Residual Layer i 
    avg_pool_kernel_size=8,                  # P: Average pool kernel size 
    drop=0,                                  # use dropout with drop proportion 
    squeeze_and_excitation=1                 # Enable/disable Squeeze-and-Excitation Block 
    ) 

device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = net.to(device)

# Print the architeture and total number of params of the model
summary(net, input_size=(3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
       BatchNorm2d-2           [-1, 64, 32, 32]             128
 AdaptiveAvgPool2d-3             [-1, 64, 1, 1]               0
            Conv2d-4              [-1, 4, 1, 1]             260
              ReLU-5              [-1, 4, 1, 1]               0
            Conv2d-6             [-1, 64, 1, 1]             320
           Sigmoid-7             [-1, 64, 1, 1]               0
           SEBlock-8           [-1, 64, 32, 32]               0
            Conv2d-9           [-1, 64, 32, 32]          36,864
      BatchNorm2d-10           [-1, 64, 32, 32]             128
           Conv2d-11           [-1, 64, 32, 32]          36,864
      BatchNorm2d-12           [-1, 64, 32, 32]             128
       BasicBlock-13           [-1, 64, 32, 32]               0
           Conv2d-14           [-1, 64,

### From above model summary, we can find that the total number of parameters of our model is 4,697,742 (under the constraint of 5 million).

### For data preprocessing, we conducted data augmentation and data normalization on the dataset.
### (The plots and results of this experiment are in plots_for_augmentation.py)

In [None]:
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding = 4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(5),
    transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10),
    transforms.ToTensor(),
    transforms.Normalize((0.4913996458053589, 0.48215845227241516, 0.44653093814849854), (0.2470322549343109, 0.24348513782024384, 0.26158788800239563))
    ])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4913996458053589, 0.48215845227241516, 0.44653093814849854), (0.2470322549343109, 0.24348513782024384, 0.26158788800239563))
    ])

train_dataset = datasets.CIFAR10(root='./data', train=True,
                                 download=True, transform=transform_train)
test_dataset = datasets.CIFAR10(root='./data', train=False,
                                download=True, transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers = 0)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers = 0)

### We tried many different training strategies: For optimizer, we tried SGD, Adam, LookAhead and Ranger. For learning rate, we tried 0.1 and 0.01. (The plots and results of this experiment are in plots_for_optimizer/lr.py)

### Through experiments, we found that Ranger optimizer with lr = 0.1 provides the best performance.

In [None]:
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005)
# optimizer = optim.Adam(net.parameters(), lr=0.1, weight_decay=0.0005)
# optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0005)
# optimizer = Lookahead(optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0005), k=5, alpha=0.5)
optimizer = Ranger(net.parameters(), lr=0.1, weight_decay=0.0005)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = 200)

### In the training process, we also tried to add Gradient Clipping to stablize it.  (The plots and results of this experiment are in plots_for_gradient_clip.py)

In [6]:
# Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    train_losses = [] 
    train_acc = []
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        nn.utils.clip_grad_value_(net.parameters(), clip_value=0.1)   # Gradient Clipping: we experimented on whether add this process or not
        optimizer.step()

        train_loss += loss.item()
        train_losses.append(train_loss)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item() 

        train_acc.append(100.*correct/total) 
        # print('Batch_idx: %d | Train Loss: %.3f | Train Acc: %.3f%% (%d/%d)'% (batch_idx, train_loss/(batch_idx+1), 100.*correct/total, correct, total)) 
    print('train_loss:', np.mean(train_losses)) 
    print('train_accuracy:', str(np.mean(train_acc)) + '%')
    train_loss_per_epoch.append(np.mean(train_losses))
    train_acc_per_epoch.append(np.mean(train_acc))
    
    
# Testing 
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    test_losses = [] 
    test_acc = [] 
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            test_losses.append(test_loss)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item() 
            test_acc.append(100.*correct/total) 
            # print('Batch_idx: %d | Test Loss: %.3f | Test Acc: %.3f%% (%d/%d)'% ( batch_idx, test_loss/(batch_idx+1), 100.*correct/total, correct, total)) 
        print('test_loss:', np.mean(test_losses)) 
        print('test_accuracy:', str(np.mean(test_acc)) + '%')
        test_loss_per_epoch.append(np.mean(test_losses))
        test_acc_per_epoch.append(np.mean(test_acc))
        
    # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc: 
        print('Saving..')
        state = {
            'net': net.state_dict(),
            'acc': acc,
            'epoch': epoch
        }
        torch.save(state, './best.pth')
        best_acc = acc


In [7]:
best_acc = 0
train_loss_per_epoch = []
test_loss_per_epoch = []
train_acc_per_epoch = []
test_acc_per_epoch = []
for epoch in range(210): 
    train(epoch)
    test(epoch)
    scheduler.step()

# Save the train and test evaluation metrics per epoch
train_test_eval = pd.DataFrame({
    "Train Loss": train_loss_per_epoch,
    "Train Accuracy": train_acc_per_epoch,
    "Test Loss": test_loss_per_epoch,
    "Test Accuracy": test_acc_per_epoch
})
train_test_eval.to_csv('./train_test_eval.csv')

Epoch: 0
train_loss: 398.5120216037916
train_accuracy: 22.477906239512365
test_loss: 28.56259649693966
test_accuracy: 28.56259649693966


Epoch: 1
train_loss: 307.5882915112064
train_accuracy: 41.56876068997233
test_loss: 23.551323956251142
test_accuracy: 23.551323956251142


Epoch: 2
train_loss: 257.2686211510997
train_accuracy: 52.3812932607036
test_loss: 23.69196097403765
test_accuracy: 23.69196097403765


Epoch: 3
train_loss: 216.0446111706212
train_accuracy: 60.48357072980576
test_loss: 19.275652007758616
test_accuracy: 19.275652007758616


Epoch: 4
train_loss: 185.65784910892893
train_accuracy: 66.9213146067268
test_loss: 19.59159562140703
test_accuracy: 19.59159562140703


Epoch: 5
train_loss: 165.3997737126582
train_accuracy: 70.15169789837763
test_loss: 13.238767424225808
test_accuracy: 13.238767424225808


Epoch: 6
train_loss: 150.87508081612378
train_accuracy: 73.36553136812228
test_loss: 11.869544850289822
test_accuracy: 11.869544850289822


Epoch: 7
train_loss: 141.5630471

In [None]:
train_test_eval

Unnamed: 0,Train Loss,Train Accuracy,Test Loss,Test Accuracy
0,1619.177015,20.226219,132.875744,40.173329
1,1243.904046,41.361891,87.739549,59.356718
2,1008.477470,53.931819,112.362868,55.608871
3,863.560530,60.753004,64.298454,71.703157
4,780.544254,64.604683,140.398815,53.969890
...,...,...,...,...
205,64.926888,97.199416,9.828154,95.998920
206,63.866885,97.318172,9.817834,95.925415
207,66.104326,97.154664,9.927568,95.725125
208,65.536809,97.259325,9.769878,95.851526


### Finally, the final test accuracy of our modified ResNet is 96.48% after trained for 209 epochs (The model was saved after this epoch):

In [15]:
train_test_eval = pd.read_csv('train_test_eval.csv')
train_test_eval[['Train Loss', 'Train Accuracy', 'Test Loss', 'Test Accuracy']]

Unnamed: 0,Train Loss,Train Accuracy,Test Loss,Test Accuracy
0,398.512022,22.477906,28.562596,49.349745
1,307.588292,41.568761,23.551324,59.000088
2,257.268621,52.381293,23.691961,61.859415
3,216.044611,60.483571,19.275652,68.197768
4,185.657849,66.921315,19.591596,69.762117
...,...,...,...,...
205,11.540171,98.091743,2.539126,96.302337
206,11.500225,98.091228,2.525335,96.387583
207,11.408549,98.032325,2.521022,96.446644
208,11.523653,98.059881,2.514548,96.481879


In [16]:
train_test_eval.iloc[208]

Unnamed: 0        208.000000
Train Loss         11.523653
Train Accuracy     98.059881
Test Loss           2.514548
Test Accuracy      96.481879
Name: 208, dtype: float64