In [1]:
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
import torch.nn as nn

# Task 1 - Dataset loaders
def load_cifar_10(batch_size):
    train_transforms = transforms.Compose([transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), # Data augmentations for training data, along with normalization values (mean, std) for CIFAR-10 across 3 channels
                                            transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                                            transforms.RandomRotation(10),
                                            transforms.ToTensor(),
                                            transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])
    
    test_transforms = transforms.Compose([transforms.ToTensor(), 
                                          transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))])

    cifar_train = torchvision.datasets.CIFAR10(root="../data", train=True, transform=train_transforms, download=True)
    cifar_test = torchvision.datasets.CIFAR10(root="../data", train=False, transform=test_transforms, download=True)  # change download to true

    return data.DataLoader(cifar_train, batch_size, shuffle=True, num_workers=2), data.DataLoader(cifar_test, batch_size, shuffle=False, num_workers=2)

batch_size = 32
train_iter, test_iter = load_cifar_10(batch_size)

# Task 2 - Model
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        self.drop = nn.Dropout1d(0.5)

        # Block 1
        # Block 1 (a) MLP Instance variables
        self.block_1_fc = nn.Linear(3, 3)
        self.block_1_fc_bn = nn.BatchNorm1d(3)
        self.block_1_a_relu = nn.ReLU()

        # K convolutional layers for block 1 (3)
        self.block_1_convolutions = nn.ModuleList([nn.Conv2d(3, 128, kernel_size=3, padding=1) for _ in range(3)])
        self.block_1_bn = nn.BatchNorm2d(128)

        self.block_1_sequential = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 16 x 16            
            nn.Dropout2d(0.3),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 8 x 8
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
        ) 

        # BLock 2
        # Block 2 (a) MLP Instance variables
        self.block_2_fc_bn = nn.BatchNorm1d(128)
        self.block_2_fc_bn_1 = nn.BatchNorm1d(64)
        self.block_2_fc_bn_2 = nn.BatchNorm1d(3)

        self.block_2_fc = nn.Linear(256, 128)
        self.block_2_fc_1 = nn.Linear(128, 64)
        self.block_2_fc_2 = nn.Linear(64, 3)

        self.block_2_a_relu = nn.ReLU()

        # K convolutional layers for block 2 (3)
        self.block_2_convolutions = nn.ModuleList([nn.Conv2d(256, 384, kernel_size=3, padding=1) for _ in range(3)])
        self.block_2_bn = nn.BatchNorm2d(384)

        self.block_2_sequential = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 4 x 4            
            nn.Dropout2d(0.3),
            nn.Conv2d(384, 384, kernel_size=3, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # 2 x 2
            nn.Conv2d(384, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        ) 

        # Classifier MLP variables
        self.bn_c1 = nn.BatchNorm1d(256)
        self.bn_c2 = nn.BatchNorm1d(128)
        self.bn_c3 = nn.BatchNorm1d(64)

        self.fc_1 = nn.Linear(512, 256)
        self.fc_2 = nn.Linear(256, 128)
        self.fc_3 = nn.Linear(128, 64)
        self.fc_4 = nn.Linear(64, 10)

        self.class_relu = nn.ReLU()

    def forward(self, x):
        #  Block 1
        #  Linear to determine a
        #  [end shape] a.shape = [32, 3], k = 3
        #  a = [[a1,a2,...,ak],
        #       [a1,a2,...,ak]]

        a = x.mean(dim=[2, 3]) # Taking avg across H,W dimensions [Spatial Average Pool] [32, 3]
        
        a = self.block_1_fc(a) 
        a = self.block_1_fc_bn(a)
        a = self.block_1_a_relu(a)
        a = self.drop(a)

        # convk(x)
        conv_outputs = [self.block_1_bn(conv(x)) for conv in self.block_1_convolutions]       

        # a1 * conv1(x) + ... + ak * convk(x) 
        out = sum([a[i, j].item() * conv_outputs[j] for i in range(len(a)) for j in range(len(a[i]))])

        out = self.block_1_sequential(out)


        #  Block 2
        #  MLP to determine a
        #  [end shape] a_1.shape = [32, 3], k = 3
        #  a = [[a1,a2,...,a3],
        #       [a1,a2,...,a3]]

        a_1 = out.mean(dim=[2, 3]) # [32, 256]

        a_1 = self.block_2_fc(a_1)
        a_1 = self.block_2_fc_bn(a_1)
        a_1 = self.block_2_a_relu(a_1)

        a_1 = self.block_2_fc_1(a_1)
        a_1 = self.block_2_fc_bn_1(a_1)
        a_1 = self.block_2_a_relu(a_1)

        a_1 = self.block_2_fc_2(a_1)
        a_1 = self.block_2_fc_bn_2(a_1)
        a_1 = self.block_2_a_relu(a_1)

        # convk(x)
        conv_outputs = [self.block_2_bn(conv(out)) for conv in self.block_2_convolutions]

        # a1 * conv1(x) + ... + ak * convk(x) 
        out_1 = sum([a_1[i, j].item() * conv_outputs[j] for i in range(len(a_1)) for j in range(len(a_1[i]))])

        out_1 = self.block_2_sequential(out_1)

        # Classifier MLP
        out_1 = out_1.mean(dim=[2, 3]) # [32, 512]
        out_1 = self.fc_1(out_1)
        out_1 = self.bn_c1(out_1)
        out_1 = self.class_relu(out_1)

        out_1 = self.fc_2(out_1)
        out_1 = self.bn_c2(out_1)
        out_1 = self.class_relu(out_1)

        out_1 = self.fc_3(out_1)
        out_1 = self.bn_c3(out_1)
        out_1 = self.class_relu(out_1)

        out_1 = self.fc_4(out_1)
        return out_1
    
# Task 4 - Training script
def train(net, train_iter, test_iter, epochs=130):
    # Set up optimizer and loss function
    
    # Task 3 - Loss and Optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    loss_function = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True, min_lr=0) # Scheduler to lower lr by factor if test loss stagnates for 10 epochs (patience)

    # Move the model to the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net.to(device)

    # Training loop
    for epoch in range(epochs):

        # Train
        net.train()
        train_loss, correct, total = 0.0, 0, 0

        for i, (X, y) in enumerate(train_iter):

            # if (i + 1) % 100 == 0:
            #     print(f"Epoch [{epoch + 1}/{epochs}] -> Batch [{i + 1}/{50000 // batch_size}]")

            # Move to GPU
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            y_hat = net(X)
            loss = loss_function(y_hat, y)
            loss.backward()
            optimizer.step()

            # Training stats
            train_loss += loss.item()
            _, predicted = y_hat.max(1)
            total += y.size(0)
            correct += predicted.eq(y).sum().item()

        train_acc = 100. * correct / total
        train_loss /= len(train_iter)


        # Evaluate
        net.eval()
        test_loss, correct, total = 0.0, 0, 0

        # No need to back prop for inference
        with torch.no_grad():
            for (X, y) in test_iter:

                X, y = X.to(device), y.to(device)
                y_hat = net(X)
                loss = loss_function(y_hat, y)

                # Test stats
                test_loss += loss.item()
                _, predicted = y_hat.max(1)
                total += y.size(0)
                correct += predicted.eq(y).sum().item()

        test_acc = 100. * correct / total
        test_loss /= len(test_iter)
            
        scheduler.step(test_loss)
        
        # torch.save({
        #     'epoch': epoch,
        #     'model_state_dict': net.state_dict(),
        #     'optimizer_state_dict': optimizer.state_dict(),
        #     'scheduler_state_dict': scheduler.state_dict(),
        #     'train_loss': train_loss,
        #     'train_acc': train_acc,
        #     'test_acc': test_acc,
        #     'test_loss': test_loss
        #     }   , f'Test/checkpoint{epoch + 1}.pth')
        
        # Print epoch stats
        print(f'Epoch {epoch + 1}/{epochs} Train Loss: {train_loss:.3f} Train Acc: {train_acc:.2f}% Test Loss: {test_loss:.3f} Test Acc: {test_acc:.2f}%')


net = Model()
train(net, train_iter, test_iter)

Files already downloaded and verified
Files already downloaded and verified
Epoch 1/130 Train Loss: 1.914 Train Acc: 25.55% Test Loss: 1.638 Test Acc: 36.23%
Epoch 2/130 Train Loss: 1.625 Train Acc: 37.56% Test Loss: 1.548 Test Acc: 43.04%
Epoch 3/130 Train Loss: 1.385 Train Acc: 49.38% Test Loss: 1.471 Test Acc: 55.43%
Epoch 4/130 Train Loss: 1.187 Train Acc: 57.77% Test Loss: 1.305 Test Acc: 63.02%
Epoch 5/130 Train Loss: 1.057 Train Acc: 63.27% Test Loss: 1.248 Test Acc: 65.73%
Epoch 6/130 Train Loss: 0.964 Train Acc: 66.56% Test Loss: 1.002 Test Acc: 70.81%
Epoch 7/130 Train Loss: 0.892 Train Acc: 69.32% Test Loss: 0.787 Test Acc: 73.29%
Epoch 8/130 Train Loss: 0.831 Train Acc: 71.63% Test Loss: 0.793 Test Acc: 75.80%
Epoch 9/130 Train Loss: 0.781 Train Acc: 73.33% Test Loss: 0.798 Test Acc: 77.21%
Epoch 10/130 Train Loss: 0.748 Train Acc: 74.76% Test Loss: 0.729 Test Acc: 78.46%
Epoch 11/130 Train Loss: 0.703 Train Acc: 76.28% Test Loss: 0.671 Test Acc: 80.71%
Epoch 12/130 Train L