The goal of this project is to train a ResNet-18 model on the CIFAR-100
dataset, apply quantization-aware training (QAT), and deploy it on a
 Kria KV260 FPGA board.

 While ResNet-18 was initially trained on the
 ImageNet-1k dataset, for this assignment, you can either
 * (1) train it from scratch on CIFAR-100 (less recommended) or
 * (2) use ImageNet pre-trained weights to initialize the model for training on CIFAR-100

# Part 1: Structure


The architecture components are listed as follows:
* Convolution(16, 3, 1, 1)
* Batch Normalization (BN)
* ReLU Activation
* BasicBlock(16, 3, 1, 1) × 3
* BasicBlock(32, 3, 2, 1)
* BasicBlock(32, 3, 1, 1) × 2
* BasicBlock(64, 3, 2, 1)
* BasicBlock(64, 3, 1, 1) × 2
* Average Pooling 2D (8, 1, 0)
* Linear Layer (64, 10)


### Setup

In [None]:
import random
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import torch
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import pandas as pd
import time
!pip install ptflops
from ptflops import get_model_complexity_info

Collecting ptflops
  Downloading ptflops-0.7.4-py3-none-any.whl.metadata (9.4 kB)
Downloading ptflops-0.7.4-py3-none-any.whl (19 kB)
Installing collected packages: ptflops
Successfully installed ptflops-0.7.4


### Basic Block

In [None]:
class BasicBlock(nn.Module):
    """Residual block made of two convolutions and a shortcut connection.

    The main path is conv -> BN -> ReLU -> conv -> BN.  Its result is added
    to the shortcut (the input itself, or ``downsample(input)`` when a
    ``downsample`` module is given) and passed through a final ReLU.

    Args:
        in_channels: Channel count of the incoming tensor.
        out_channels: Channel count produced by both convolutions.
        kernel_size: Kernel size shared by both convolutions.
        stride: Stride of the first convolution; the second always uses 1.
        padding: Padding shared by both convolutions.
        downsample: Optional module applied to the input before the
            residual addition.
    """

    expansion = 1  # Channel multiplier of the block; 1 means no expansion

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, downsample=None):
        super().__init__()
        self.stride = stride

        # Options common to both convolutions (bias is disabled on each).
        shared = dict(kernel_size=kernel_size, padding=padding, bias=False)

        # Conv / BN / ReLU: the only place where `stride` is applied.
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **shared)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        # Conv / BN: keeps the spatial stride at 1.
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **shared)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Optional transform of the shortcut branch.
        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))

        # Use the raw input as shortcut unless a downsample module exists.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)

### ResNet18

In [None]:
class ResNet18(nn.Module):
    """ResNet-18 classifier assembled from ``BasicBlock`` modules.

    Layout: a 7x7 stride-2 convolution with BN and ReLU, a 3x3 stride-2 max
    pool, four stages of two ``BasicBlock``s each (64, 128, 256 and 512
    channels), adaptive average pooling to 1x1 and a linear classifier.

    Args:
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Stem: convolution, normalisation, activation and max pooling.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first uses stride 2
        # and doubles the channel count.
        self.layer1 = self._make_layer(64, 64, blocks=2, stride=1)
        self.layer2 = self._make_layer(64, 128, blocks=2, stride=2)
        self.layer3 = self._make_layer(128, 256, blocks=2, stride=2)
        self.layer4 = self._make_layer(256, 512, blocks=2, stride=2)

        # Head: global pooling followed by the classifier.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        feature_dim = 512 * BasicBlock.expansion
        self.fc = nn.Linear(feature_dim, num_classes)

        self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage of ``blocks`` BasicBlocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and, when the stride or
        the channel count changes, a 1x1 conv + BN module for its shortcut.
        """
        shape_changes = not (stride == 1 and in_channels == out_channels)

        projection = None
        if shape_changes:
            projection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

        first = BasicBlock(in_channels, out_channels, stride=stride, downsample=projection)
        rest = [BasicBlock(out_channels, out_channels) for _ in range(blocks - 1)]
        return nn.Sequential(first, *rest)

    def forward(self, x):
        """Run the stem, the four stages, pooling and the classifier."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        # Collapse the pooled feature map to (batch, features).
        x = torch.flatten(self.avgpool(x), 1)
        return self.fc(x)

    def _initialize_weights(self):
        """Kaiming-normal init for convolutions; BN weight=1 and bias=0."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

# Part 2: Check Usage

Calculate the number of trainable parameters, the minimum required on-chip memory (in MB), the number of FLOPs (floating-point operations),
and the model’s latency on both GPU and CPU.

## Calculate the number of trainable parameters

In [None]:
def count_parameters(model, trainable_only=True):
    """Count, print and return the number of parameters of ``model``.

    Args:
        model: Module whose parameters are counted.
        trainable_only: When True (default) only parameters with
            ``requires_grad=True`` are counted, so the number matches the
            printed "Trainable" label. Pass False to include frozen
            parameters as well.

    Returns:
        The parameter count as an int.
    """
    total_params = sum(
        p.numel()
        for p in model.parameters()
        if p.requires_grad or not trainable_only
    )
    label = "Total Trainable Parameters" if trainable_only else "Total Parameters"
    print(f"{label}: {total_params}")
    return total_params

# Build a ResNet18 with its default arguments and report its parameter count.
model = ResNet18()
count_parameters(model)

Total Trainable Parameters: 11689512


11689512

## Calculate the minimum required onchip memory (in MB)

In [None]:
# Model size in MB
def get_model_size(model, bytes_per_element=1):
    """Estimate, print and return the storage needed for ``model`` in MB.

    Every element of every parameter and buffer is assumed to occupy
    ``bytes_per_element`` bytes, regardless of the tensor's current dtype.

    Args:
        model: Module whose parameters and buffers are counted.
        bytes_per_element: Storage per element. The default of 1 models
            8-bit quantized weights; use 4 for float32.

    Returns:
        The size in MB, using decimal megabytes (1 MB = 1000 ** 2 bytes).
    """
    num_elements = sum(param.nelement() for param in model.parameters())
    num_elements += sum(buffer.nelement() for buffer in model.buffers())

    model_size = num_elements * bytes_per_element / 1000 ** 2  # Convert to MB
    print(f"Model Size: {model_size:.2f} MB")
    return model_size

# Build a ResNet18 with its default arguments and report its estimated size.
model = ResNet18()
get_model_size(model)

Model Size: 11.70 MB


11.699132

## Calculate the number of Multiply-Accumulate operations (MACs)

In [None]:
def compute_flops(model, input_res=32):
    """Print the MAC count and parameter count that ptflops reports.

    Args:
        model: Network to analyse.
        input_res: Height and width of the square, 3-channel input used
            for the analysis.
    """
    input_shape = (3, input_res, input_res)
    # Ask for human-readable strings and suppress the per-layer report.
    report_options = dict(as_strings=True, print_per_layer_stat=False, verbose=False)

    macs, params = get_model_complexity_info(model, input_shape, **report_options)

    print(f"Computational Complexity (MACs): {macs}")
    print(f"Number of Parameters: {params}")

# Build a ResNet18 with its default arguments and report MACs for the
# default 32x32 input resolution of compute_flops.
model = ResNet18()
compute_flops(model)

Computational Complexity (MACs): 37.75 MMac
Number of Parameters: 11.69 M


## Calculate the model’s latency on both GPU and CPU

In [None]:
def measure_inference_time(model, device='cpu', input_size=(1, 3, 32, 32), num_runs=100):
    """Time forward passes of ``model`` on random input and print the result.

    The model is switched to eval mode and moved to ``device`` (both changes
    persist after the call). Ten untimed warm-up passes are followed by
    ``num_runs`` timed passes; the mean latency in milliseconds and the
    throughput in samples per second are printed.

    Args:
        model: Module to benchmark.
        device: Device string or ``torch.device`` to run on.
        input_size: Shape of the random input; its first entry is taken as
            the batch size when computing throughput.
        num_runs: Number of timed forward passes.

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs <= 0:
        raise ValueError(f"num_runs must be a positive integer, got {num_runs}")

    target = torch.device(device)
    on_cuda = target.type == 'cuda'

    def _wait_for_device():
        # CUDA kernels are launched asynchronously, so without this barrier
        # the clock would cover the launch rather than the computation.
        if on_cuda:
            torch.cuda.synchronize(target)

    model.eval()
    model.to(target)   # IMPORTANT: model and input must share a device
    input_tensor = torch.randn(input_size, device=target)

    timings = []
    with torch.no_grad():
        # Warm-up runs (excluded from the measurement)
        for _ in range(10):
            _ = model(input_tensor)

        # Timing runs
        for _ in range(num_runs):
            _wait_for_device()
            start_time = time.perf_counter()
            _ = model(input_tensor)
            _wait_for_device()
            timings.append(time.perf_counter() - start_time)

    avg_time_per_run = sum(timings) / num_runs
    # Guard against a zero average on clocks too coarse for a tiny model.
    throughput = input_size[0] / avg_time_per_run if avg_time_per_run > 0 else float('inf')

    print(f"Average Inference Time: {avg_time_per_run * 1000:.2f} ms")
    print(f"Throughput: {throughput:.2f} samples/sec")

# Benchmark a fresh ResNet18 on a single 224x224 input, first on the GPU
# (when one is present) and then on the CPU.
model = ResNet18()
if torch.cuda.is_available():
    print("Latency on GPU")
    measure_inference_time(model, device='cuda', input_size=(1, 3, 224, 224))
else:
    # Requesting 'cuda' on a CPU-only host would abort the whole cell.
    print("Latency on GPU: skipped (CUDA is not available)")
print("Latency on CPU")
measure_inference_time(model, device='cpu', input_size=(1, 3, 224, 224))

Latency on GPU
Average Inference Time: 3.21 ms
Throughput: 311.79 samples/sec
Latency on CPU
Average Inference Time: 114.10 ms
Throughput: 8.76 samples/sec


## Bottleneck check for each component

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

# `model` is not created here; this cell reuses the one defined by an
# earlier cell.
model = model.eval()
# Random batch of 5 inputs shaped (3, 224, 224).
inputs = torch.randn(5, 3, 224, 224)

# Record CPU activity only, including memory usage and operator input shapes.
# NOTE(review): the forward pass is not wrapped in torch.no_grad() — confirm
# whether the memory figures are meant to include autograd bookkeeping.
with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
    # Label the whole forward pass as "model_inference" in the report.
    with record_function("model_inference"):
        model(inputs)

# Print the averaged table sorted by total CPU time, limited to 10 rows.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(type(model))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         2.11%       6.452ms       100.00%     306.329ms     306.329ms           0 b    -106.26 Mb             1  
                     aten::conv2d         0.08%     245.608us        74.59%     228.503ms      11.425ms      47.37 Mb           0 b            20  
                aten::convolution         0.12%     369.278us        74.51%     228.257ms      11.413ms      47.37 Mb           0 b            20  
               aten::_convolution         0.08%     243.013us        74.39%     227.888ms      11.394ms      47.