In [17]:
conda install cudatoolkit=11.8 -c nvidia

Channels:
 - nvidia
 - conda-forge
 - pytorch
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 23.11.0
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [None]:
# -----------------------------------------------------------
#  Neural Networks and Deep Learning Coursework - CIFAR-10 classification
#  Student Name: Ka Yi Cheng
#  Student id: 220566472
#  Date: 11th April, 2025
#  Reference Taken:
#  JupyterHub/ECS659P-ECS7026P/ResNet_Answer.ipynb
# -----------------------------------------------------------

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import time
from torch.utils.checkpoint import checkpoint

In [2]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device}.')
import torch.nn as nn
import torch.nn.functional as F

Using cuda.


In [3]:
import os
# Ask PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): this runs after `torch` was imported and CUDA availability was
# queried in the previous cell -- confirm the allocator still picks the setting up;
# exporting it before `import torch` is the safe order.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
import torchvision
def load_data_cifar_10(batch_size, resize=None):
    """Build the CIFAR-10 training and test `DataLoader`s.

    The dataset is stored under ``./cifar-10_data`` and fetched with
    ``download=True``. Every image is converted to a tensor and normalised
    per channel; when `resize` is truthy a `Resize(resize)` step runs first.

    Args:
        batch_size: number of samples per mini-batch for both loaders.
        resize: optional target size forwarded to `torchvision.transforms.Resize`.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles,
        the test loader does not; both use 2 workers and pinned memory.
    """
    T = torchvision.transforms

    steps = []
    if resize:
        steps.append(T.Resize(resize))  # must run before ToTensor, i.e. first in the pipeline
    steps.append(T.ToTensor())
    steps.append(T.Normalize(mean=(0.4914, 0.4822, 0.4465),
                             std=(0.2023, 0.1994, 0.2010)))
    pipeline = T.Compose(steps)

    def make_loader(is_train):
        # The training split is shuffled; the test split keeps its order.
        dataset = torchvision.datasets.CIFAR10(root='./cifar-10_data', train=is_train,
                                               download=True, transform=pipeline)
        return torch.utils.data.DataLoader(dataset, batch_size, shuffle=is_train,
                                           num_workers=2, pin_memory=True)

    return make_loader(True), make_loader(False)

In [5]:
batch_size = 128 # Mini-batch size passed to both the training and the test loader.
train_iter, test_iter = load_data_cifar_10(batch_size, resize=(32, 32)) # Builds the CIFAR-10 loaders with a (32, 32) resize step; `train_iter` and `test_iter` are `DataLoader` objects.

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# Pull the first training batch and show the shapes of its two parts.
X, y = next(iter(train_iter))
print(X.size())  # images
print(y.size())  # labels (one entry per image in the batch)

torch.Size([128, 3, 32, 32])
torch.Size([128])


In [7]:
# Stem is the model's entry point, transforming a raw image (e.g., 32x32x3) into a richer feature map.
class Stem(nn.Module):
    """Entry layer: 3x3 convolution, batch normalisation, then ReLU.

    The convolution uses stride 1 and padding 1, so the spatial size of the
    input is kept and only the channel count changes from `in_channels` to
    `out_channels`.
    """

    def __init__(self, in_channels=3, out_channels=64):
        super().__init__()
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]
        # Kept as a Sequential named `conv` so parameter names stay `conv.0.*` / `conv.1.*`.
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Return the feature map produced by conv -> batch norm -> ReLU."""
        features = self.conv(x)
        return features

In [8]:
# the Block flows: Spatial average pooling → FC1 → Activation → FC2 → Softmax
# and then, K convolutional branches weighted by output from expert branch.
class Block(nn.Module):
    """K parallel 3x3 convolutions mixed by an input-dependent softmax gate.

    Gate ("expert") branch: global average pool -> fc1 -> ReLU -> fc2 -> softmax,
    giving one weight per convolution for every sample (each row sums to 1).
    Convolutional branch: all K convolutions (stride 1, padding 1) are applied
    to the same input and their outputs are summed using the gate weights.
    No activation is applied to the mixed output.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by every convolution branch.
        K: number of parallel convolution branches.
        r: reduction ratio of the gate's hidden layer (`in_channels // r` units,
            but never fewer than one).
    """

    def __init__(self, in_channels, out_channels, K=4, r=4):
        super().__init__()
        self.K = K
        self.conv_layers = nn.ModuleList([
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            for _ in range(K)
        ])

        # `in_channels // r` is 0 whenever in_channels < r, which would create a
        # zero-width bottleneck and make the gate ignore its input entirely.
        hidden = max(1, in_channels // r)
        self.fc1 = nn.Linear(in_channels, hidden)
        self.fc2 = nn.Linear(hidden, K)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the gate-weighted sum of the K convolution outputs for `x` ([B, C, H, W])."""
        batch_size, C, _, _ = x.size()

        # Expert branch: one weight per convolution and per sample, shape [B, K].
        pooled = F.adaptive_avg_pool2d(x, 1).view(batch_size, C)
        a = self.softmax(self.fc2(F.relu(self.fc1(pooled))))

        # Convolutional branch: broadcast each sample's weight over (C, H, W).
        return sum(
            a[:, i].view(batch_size, 1, 1, 1) * conv(x)
            for i, conv in enumerate(self.conv_layers)
        )


In [9]:
# The Classifier turns the 2D feature maps to 1D logits.
class Classifier(nn.Module):
    """Classification head: global average pooling followed by one linear layer.

    Returns raw logits; no softmax is applied here.

    Args:
        in_channels: channels of the incoming feature map.
        num_classes: number of logits produced per sample.
    """

    def __init__(self, in_channels, num_classes=10):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)  # averages each channel over H x W
        self.fc = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        """Map a (B, C, H, W) feature map to (B, num_classes) logits."""
        pooled = self.pool(x)                # (B, C, 1, 1)
        batch = pooled.size(0)
        flat = pooled.view(batch, -1)        # (B, C)
        return self.fc(flat)                 # (B, num_classes)


In [10]:
# the CNN flows: Stem → Block1 → Block2 → ... → Classifier
class CustomCNN(nn.Module):
    """Full network: Stem -> `num_blocks` x Block -> Classifier (10 classes).

    Args:
        stem_channels: channels produced by the stem.
        num_blocks: number of `Block` modules stacked after the stem.
        block_out_channels: channels produced by every block.
        K: number of convolution branches inside each block.
    """

    def __init__(self, stem_channels=64, num_blocks=3, block_out_channels=64, K=4):
        super().__init__()
        self.stem = Stem(3, stem_channels)

        self.blocks = nn.ModuleList()
        # Only the first block sees the stem's width; every later block receives
        # the previous block's output, so the input width has to be carried along.
        in_channels = stem_channels
        for _ in range(num_blocks):
            self.blocks.append(Block(in_channels, block_out_channels, K=K))
            in_channels = block_out_channels

        # `in_channels` is the width of whatever feeds the head (the stem itself
        # when num_blocks == 0).
        self.classifier = Classifier(in_channels, num_classes=10)

    def forward(self, x):
        """Return class logits for a batch of 3-channel images."""
        x = self.stem(x)
        for block in self.blocks:
            x = block(x)
        return self.classifier(x)


In [11]:
# Build the network with its default hyper-parameters and move it to `device`.
model = CustomCNN().to(device)

In [12]:
# Loss function: cross-entropy, fed with the raw logits returned by the classifier head.
criterion = torch.nn.CrossEntropyLoss()

In [19]:
# Optimisation algorithm: SGD with momentum over all model parameters.
lr = 0.01  # initial learning rate
momentum=0.9
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
# StepLR multiplies the learning rate by `gamma` (0.5) after every `step_size` (20)
# calls to `scheduler.step()`; the training loop calls it once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [14]:
# Weight initialisation: Kaiming (He) normal for convolutions, Xavier uniform for linear layers.
def init_weights(m):
    """Re-initialise the weight of a single module in place.

    `nn.Linear` weights get Xavier-uniform values and `nn.Conv2d` weights get
    Kaiming-normal values (ReLU gain). Biases and every other module type are
    left untouched. Intended to be passed to `nn.Module.apply`.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        return
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

# Run `init_weights` over the model's modules; as the cell's last expression,
# the returned module is displayed below.
model.apply(init_weights)

CustomCNN(
  (stem): Stem(
    (conv): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (blocks): ModuleList(
    (0-2): 3 x Block(
      (conv_layers): ModuleList(
        (0-3): 4 x Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (fc1): Linear(in_features=64, out_features=16, bias=True)
      (fc2): Linear(in_features=16, out_features=4, bias=True)
      (softmax): Softmax(dim=1)
    )
  )
  (classifier): Classifier(
    (pool): AdaptiveAvgPool2d(output_size=1)
    (fc): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [15]:
# correct classification
def correct(logits, y):
    """Count the rows of `logits` whose highest-scoring column equals the label in `y`.

    Returns the count as a 0-dim float tensor.
    """
    predicted = logits.argmax(dim=1)   # index of the largest score in each row
    matches = (predicted == y)         # boolean mask, True where the prediction is right
    return matches.float().sum()

In [16]:
def evaluate_model(model, dataloader, device, loss_fn=None):
    """Compute the mean loss and the accuracy of `model` over `dataloader`.

    The model is switched to evaluation mode (and left in it) and no gradients
    are recorded.

    Args:
        model: network returning one row of class scores per sample.
        dataloader: iterable of ``(images, labels)`` batches.
        device: device the batches are moved to before the forward pass.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning the *mean* loss
            of a batch. Defaults to the notebook-level `criterion`.

    Returns:
        ``(mean_loss, accuracy)`` where `mean_loss` is averaged over samples and
        `accuracy` is a percentage in [0, 100].

    Raises:
        ValueError: if `dataloader` yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion  # notebook-level loss, looked up only when not supplied

    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():  # no gradient calculation
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)  # one row of class scores per sample
            batch_n = labels.size(0)

            # Weight each batch mean by its size: the last batch is usually
            # shorter, so a plain mean of batch means would over-weight it.
            loss_sum += loss_fn(outputs, labels).item() * batch_n
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_n

    if num_seen == 0:
        raise ValueError("evaluate_model: dataloader yielded no samples")

    return loss_sum / num_seen, 100 * num_correct / num_seen

In [None]:
# Metric tracking: one entry per epoch.
# NOTE: this cell continues from the current state of `model`, `optimizer` and
# `scheduler`. For a fresh run, re-execute the model, initialisation and
# optimiser cells first.
train_losses = []
test_losses = []
train_accs = []
test_accs = []

num_epochs = 50

for epoch in range(num_epochs):
    print(f'\nEpoch {epoch + 1}/{num_epochs}')
    start_time = time.perf_counter()

    model.train()  # Set model to training mode
    running_loss = 0.0   # sum of per-sample losses seen this epoch
    correct_train = 0
    total_train = 0

    for X, y in train_iter:
        # The loaders use pinned memory, so the host-to-device copies can be asynchronous.
        X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)

        # Forward pass
        logits = model(X)
        loss = criterion(logits, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track training loss and accuracy. `loss` is a batch mean, so weight it
        # by the batch size; the last batch is shorter than the others.
        batch_n = y.size(0)
        running_loss += loss.item() * batch_n
        total_train += batch_n
        correct_train += (logits.argmax(dim=1) == y).sum().item()

    # Store training loss and accuracy (accumulated while the weights were changing)
    train_losses.append(running_loss / total_train)
    train_accs.append(100 * correct_train / total_train)

    # --- Validation ---
    # `evaluate_model` switches to eval mode and disables gradients itself.
    test_loss, test_acc = evaluate_model(model, test_iter, device)
    test_losses.append(test_loss)
    test_accs.append(test_acc)

    # Step the learning rate scheduler (once per epoch)
    scheduler.step()

    end_time = time.perf_counter()

    # Print epoch details
    print(f'Training loss: {train_losses[-1]:.4f}, Training accuracy: {train_accs[-1]:.2f}%')
    print(f'Testing loss: {test_losses[-1]:.4f}, Testing accuracy: {test_accs[-1]:.2f}%')
    print(f'Duration: {end_time - start_time:.3f}s')

final_val_acc = evaluate_model(model, test_iter, device)[1]
print(f"Final validation accuracy: {final_val_acc}%")

# --- Plotting Training Curves ---
# Two side-by-side panels on one figure: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss curves (the "Validation" series is measured on the test loader)
loss_ax.plot(train_losses, label='Training Loss', color='b')
loss_ax.plot(test_losses, label='Validation Loss', color='r')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Loss Evolution')

# Accuracy curves
acc_ax.plot(train_accs, label='Training Accuracy', color='b')
acc_ax.plot(test_accs, label='Validation Accuracy', color='r')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.legend()
acc_ax.set_title('Accuracy Evolution')

# Render the figure
fig.tight_layout()
plt.show()

# --- Displaying Hyperparameters ---
print("\nTraining Details:")
print(f"Batch Size: {batch_size}")
print(f"Learning Rate: {lr}")
print(f"Momentum: {momentum}")
print(f"Number of Epochs: {num_epochs}")
print("Optimizer: SGD with momentum")


Epoch 1/50
Training loss: 0.7000, Training accuracy: 74.72%
Testing loss: 0.9830, Testing accuracy: 67.27%
Duration: 27.027s

Epoch 2/50
Training loss: 0.7013, Training accuracy: 74.66%
Testing loss: 0.9901, Testing accuracy: 67.13%
Duration: 26.384s

Epoch 3/50
Training loss: 0.6996, Training accuracy: 74.66%
Testing loss: 0.9837, Testing accuracy: 67.15%
Duration: 27.270s

Epoch 4/50
Training loss: 0.7018, Training accuracy: 74.69%
Testing loss: 0.9860, Testing accuracy: 67.21%
Duration: 26.165s

Epoch 5/50
Training loss: 0.7001, Training accuracy: 74.79%
Testing loss: 0.9831, Testing accuracy: 67.12%
Duration: 26.835s

Epoch 6/50
Training loss: 0.6994, Training accuracy: 74.77%
Testing loss: 0.9836, Testing accuracy: 67.14%
Duration: 25.980s

Epoch 7/50
Training loss: 0.6996, Training accuracy: 74.68%
Testing loss: 0.9833, Testing accuracy: 67.18%
Duration: 26.564s

Epoch 8/50
Training loss: 0.7017, Training accuracy: 74.68%
Testing loss: 0.9884, Testing accuracy: 67.25%
Duration: 