In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import torch.nn as nn
import torchvision.datasets as datasets
from torch.utils.data import random_split, DataLoader
from collections import Counter
torch.cuda.empty_cache()
import torch.optim as optim
import torch.nn.functional as F
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
from google.colab import files
from scipy.stats import beta

The CIFAR-100 training set you get from torchvision contains 50,000 images. Randomly divide
this dataset into a new training set and a validation set, containing 40,000 and 10,000 data points
respectively. Use random seed 0 for the partitioning. Show the following in your report.
1. The line(s) of code you use to partition the data.


In [2]:
# Load the CIFAR-100 dataset from torchvision
cifar100Data = datasets.CIFAR100(root='./data', train=True, download=True, transform=transforms.ToTensor())

# Get class names
class_names = cifar100Data.classes

# Split the dataset into training and validation sets
trainSize = int((4/5) * len(cifar100Data))
validationSize = len(cifar100Data) - trainSize
seed = 0
trainData, testData = random_split(cifar100Data, [trainSize, validationSize], generator=torch.Generator().manual_seed(seed))



Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:01<00:00, 101735833.60it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data


2. The proportion of each class in the new training set.

In [None]:
# Access the labels and class names of the training data
trainLabels = [label for _, label in trainData]

# Count the occurrences of each label
labelCounts = Counter(trainLabels)

# Calculate the proportion of each class
classProportions = {label: count / len(trainLabels) for label, count in labelCounts.items()}

# Sort the class proportions by label in ascending order
classProportions_sorted = dict(sorted(classProportions.items()))

# Print the sorted class proportions with class names
print("Class Proportions in the training data (Sorted):")
for label, proportion in classProportions_sorted.items():
    class_name = class_names[label]
    print(f"{class_name}: {proportion*100:.4f}%")


Compute the mean and standard deviation for each color channel on the training set. Report these
numbers and use them in your preprocessing pipeline to whiten / normalize the data.

In [3]:
# Calculate mean and standard deviation for each color channel
data = next(iter(DataLoader(trainData, batch_size=len(trainData), shuffle=False)))
mean = torch.mean(data[0], dim=(0, 2, 3))
std = torch.std(data[0], dim=(0, 2, 3))

# Print the computed mean and standard deviation
print("Mean for each color channel:", mean)
print("Standard deviation for each color channel:", std)


Mean for each color channel: tensor([0.5068, 0.4861, 0.4403])
Standard deviation for each color channel: tensor([0.2671, 0.2563, 0.2759])


3. We will first investigate the initial learning rate. Run three experiments with the learning rate set to 0.5, 0.05, and 0.01 respectively. The batch size should be set to 128. Train the networks for 15 epochs under each setting.

In [4]:
def main(args):
    # fix random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.use_deterministic_algorithms(True)

    # train val test
    # AI6103 students: You need to create the dataloaders youself
    train_loader, valid_loader = get_train_valid_loader(args.batch_size,args.seed)
    if args.test:
        test_loader = get_test_loader(args.batch_size)

    # model
    model = MobileNet(100)
    print(model)
    model.cuda()

    # criterion
    criterion = torch.nn.CrossEntropyLoss().cuda()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.wd)
    if args.lr_scheduler:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
    else:
        scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0, total_iters=args.epochs)

    stat_training_loss = []
    stat_val_loss = []
    stat_training_acc = []
    stat_val_acc = []
    for epoch in range(args.epochs):
        training_loss = 0
        training_acc = 0
        training_samples = 0
        val_loss = 0
        val_acc = 0
        val_samples = 0
        # training
        model.train()
        for imgs, labels in train_loader:
            imgs = imgs.cuda()
            labels = labels.cuda()

            batch_size = imgs.shape[0]
            optimizer.zero_grad()
            logits = model.forward(imgs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            _, top_class = logits.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            training_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            training_loss += batch_size * loss.item()
            training_samples += batch_size
        # validation
        model.eval()
        for val_imgs, val_labels in valid_loader:
            batch_size = val_imgs.shape[0]
            val_logits = model.forward(val_imgs.cuda())
            loss = criterion(val_logits, val_labels.cuda())
            _, top_class = val_logits.topk(1, dim=1)
            equals = top_class == val_labels.cuda().view(*top_class.shape)
            val_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            val_loss += batch_size * loss.item()
            val_samples += batch_size
        assert val_samples == 10000
        # update stats
        stat_training_loss.append(training_loss/training_samples)
        stat_val_loss.append(val_loss/val_samples)
        stat_training_acc.append(training_acc/training_samples)
        stat_val_acc.append(val_acc/val_samples)
        # print
        print(f"Epoch {(epoch+1):d}/{args.epochs:d}.. Learning rate: {scheduler.get_lr()[0]:.4f}.. Train loss: {(training_loss/training_samples):.4f}.. Train acc: {(training_acc/training_samples):.4f}.. Val loss: {(val_loss/val_samples):.4f}.. Val acc: {(val_acc/val_samples):.4f}")
        # lr scheduler
        scheduler.step()
    # plot
    plot_loss_acc(stat_training_loss, stat_val_loss, stat_training_acc, stat_val_acc, args.fig_name)
    # test
    if args.test:
        test_loss = 0
        test_acc = 0
        test_samples = 0
        for test_imgs, test_labels in test_loader:
            batch_size = test_imgs.shape[0]
            test_logits = model.forward(test_imgs.cuda())
            test_loss = criterion(test_logits, test_labels.cuda())
            _, top_class = test_logits.topk(1, dim=1)
            equals = top_class == test_labels.cuda().view(*top_class.shape)
            test_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            test_loss += batch_size * test_loss.item()
            test_samples += batch_size
        assert test_samples == 10000
        print('Test loss: ', test_loss/test_samples)
        print('Test acc: ', test_acc/test_samples)


def get_test_loader(batch_size):
    # Define transformations
    transformTest = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    testDataTransformed = datasets.CIFAR100(
        root='./data', train=False, download=True, transform=transformTest
    )
    testData = DataLoader(testDataTransformed, batch_size=batch_size,shuffle=False)
    return testData

def get_train_valid_loader(batch_size,seed):
    # Define transformations
    transformation = transforms.Compose([
        transforms.Pad(4),
        transforms.RandomCrop(32),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # Apply transformations to training data
    trainDataTransformed = datasets.CIFAR100(
        root='./data', train=True, download=True, transform=transformation
    )
    TrainData, ValidationData = random_split(trainDataTransformed, [trainSize, validationSize], generator=torch.Generator().manual_seed(seed))
    trainingData = DataLoader(TrainData, batch_size=batch_size,shuffle=True)
    validationData  = DataLoader(ValidationData, batch_size=batch_size,shuffle=True)
    return trainingData,validationData

In [5]:
class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out


class MobileNet(nn.Module):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

    def _make_layers(self, in_planes):
        layers = []
        for x in self.cfg:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

In [6]:
def plot_loss_acc(train_loss, val_loss, train_acc, val_acc, fig_name):
    x = np.arange(len(train_loss))
    max_loss = max(max(train_loss), max(val_loss))

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('epoch')
    ax1.set_ylabel('loss')
    ax1.set_ylim([0,max_loss+1])
    lns1 = ax1.plot(x, train_loss, 'yo-', label='train_loss')
    lns2 = ax1.plot(x, val_loss, 'go-', label='val_loss')
    # ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()
    ax2.set_ylabel('accuracy')
    ax2.set_ylim([0,1])
    lns3 = ax2.plot(x, train_acc, 'bo-', label='train_acc')
    lns4 = ax2.plot(x, val_acc, 'ro-', label='val_acc')
    # ax2.tick_params(axis='y', labelcolor='tab:red')

    lns = lns1+lns2+lns3+lns4
    labs = [l.get_label() for l in lns]
    ax2.legend(lns, labs, loc=0)

    fig.tight_layout()
    plt.title(fig_name)

    plt.savefig(f'{fig_name}.png')
    files.download(f'{fig_name}.png')
    # Directory where you want to save the .npz file
    directory = './content'

    # Create the directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)
    np.savez(os.path.join(directory, f'{fig_name}.npz'), train_loss=train_loss, val_loss=val_loss, train_acc=train_acc, val_acc=val_acc)


In [7]:
class Args:
    def __init__(self,batchSize, learningRate,epoch,weightDecay,learningRateScheduler,figureName):
        # Define the default values for your arguments
        self.batch_size = batchSize
        self.lr = learningRate
        self.epochs = epoch
        self.wd=weightDecay
        self.lr_scheduler=learningRateScheduler
        self.fig_name=figureName
        self.test=True
        self.seed=0

In [None]:


# Reinitialize the model and optimizer for each learning rate
learning_rates = [0.5, 0.05, 0.01]
epochQ3 = 15
for lr in learning_rates:
    main(Args(128,lr,epochQ3,0,False,f"Accuracy and Loss (Learning Rate = {lr})"))


Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, ep



Epoch 1/15.. Learning rate: 0.5000.. Train loss: 4.6016.. Train acc: 0.0156.. Val loss: 4.4590.. Val acc: 0.0275
Epoch 2/15.. Learning rate: 0.5000.. Train loss: 4.2118.. Train acc: 0.0469.. Val loss: 4.1078.. Val acc: 0.0588
Epoch 3/15.. Learning rate: 0.5000.. Train loss: 3.9210.. Train acc: 0.0793.. Val loss: 3.7865.. Val acc: 0.0979
Epoch 4/15.. Learning rate: 0.5000.. Train loss: 3.6525.. Train acc: 0.1195.. Val loss: 3.5923.. Val acc: 0.1327
Epoch 5/15.. Learning rate: 0.5000.. Train loss: 3.4069.. Train acc: 0.1603.. Val loss: 3.5537.. Val acc: 0.1495
Epoch 6/15.. Learning rate: 0.5000.. Train loss: 3.1838.. Train acc: 0.2004.. Val loss: 3.3122.. Val acc: 0.1901
Epoch 7/15.. Learning rate: 0.5000.. Train loss: 2.9909.. Train acc: 0.2349.. Val loss: 3.2752.. Val acc: 0.1912
Epoch 8/15.. Learning rate: 0.5000.. Train loss: 2.8171.. Train acc: 0.2698.. Val loss: 2.9834.. Val acc: 0.2589
Epoch 9/15.. Learning rate: 0.5000.. Train loss: 2.6770.. Train acc: 0.2992.. Val loss: 2.8971..

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test loss:  tensor(0.0040, device='cuda:0', grad_fn=<DivBackward0>)
Test acc:  0.3992
Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test loss:  tensor(0.0032, device='cuda:0', grad_fn=<DivBackward0>)
Test acc:  0.4648
Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test loss:  tensor(0.0035, device='cuda:0', grad_fn=<DivBackward0>)
Test acc:  0.4053


4. Next, we gradually decrease the learning rate. One effective learning rate schedule is cosine annealing. Describe this particular schedule intuitively and with one or more mathematical equations (5%). Use the best learning rate identified earlier as the initial learning rate and keep all other settings and hyperparameters unchanged. Conduct experiments under two settings: (1) train for 300 epochs with the learning rate held constant, and (2) train for 300 epochs with cosine annealing, which decreases the initial learning rate to zero over the entirety of the training session.

In [None]:
epochQ4 = 300
#Cosine Scheduler
main(Args(128,0.05,epochQ4,0,True,"Accuracy and Loss with Cosine Annealing learning rate schedule"))
#No Scheduler
main(Args(128,0.05,epochQ4,0,False,"Accuracy and Loss without learning rate schedule"))

Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, ep



Epoch 1/300.. Learning rate: 0.0500.. Train loss: 4.1953.. Train acc: 0.0614.. Val loss: 3.9311.. Val acc: 0.0898
Epoch 2/300.. Learning rate: 0.0500.. Train loss: 3.6738.. Train acc: 0.1269.. Val loss: 3.5207.. Val acc: 0.1525
Epoch 3/300.. Learning rate: 0.0500.. Train loss: 3.3706.. Train acc: 0.1792.. Val loss: 3.3240.. Val acc: 0.1863
Epoch 4/300.. Learning rate: 0.0500.. Train loss: 3.1469.. Train acc: 0.2220.. Val loss: 3.0946.. Val acc: 0.2336
Epoch 5/300.. Learning rate: 0.0500.. Train loss: 2.9350.. Train acc: 0.2587.. Val loss: 2.9913.. Val acc: 0.2672
Epoch 6/300.. Learning rate: 0.0500.. Train loss: 2.7561.. Train acc: 0.2941.. Val loss: 2.7416.. Val acc: 0.2997
Epoch 7/300.. Learning rate: 0.0499.. Train loss: 2.5856.. Train acc: 0.3256.. Val loss: 2.6451.. Val acc: 0.3149
Epoch 8/300.. Learning rate: 0.0499.. Train loss: 2.4397.. Train acc: 0.3523.. Val loss: 2.5311.. Val acc: 0.3456
Epoch 9/300.. Learning rate: 0.0499.. Train loss: 2.3150.. Train acc: 0.3799.. Val loss:

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test loss:  tensor(0.0047, device='cuda:0', grad_fn=<DivBackward0>)
Test acc:  0.5686
Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64



Epoch 1/300.. Learning rate: 0.0500.. Train loss: 4.1953.. Train acc: 0.0614.. Val loss: 3.9311.. Val acc: 0.0898
Epoch 2/300.. Learning rate: 0.0500.. Train loss: 3.6726.. Train acc: 0.1258.. Val loss: 3.5276.. Val acc: 0.1504
Epoch 3/300.. Learning rate: 0.0500.. Train loss: 3.3727.. Train acc: 0.1783.. Val loss: 3.3220.. Val acc: 0.1871
Epoch 4/300.. Learning rate: 0.0500.. Train loss: 3.1293.. Train acc: 0.2248.. Val loss: 3.0187.. Val acc: 0.2423
Epoch 5/300.. Learning rate: 0.0500.. Train loss: 2.9283.. Train acc: 0.2584.. Val loss: 2.8833.. Val acc: 0.2757
Epoch 6/300.. Learning rate: 0.0500.. Train loss: 2.7428.. Train acc: 0.2946.. Val loss: 2.7538.. Val acc: 0.3036
Epoch 7/300.. Learning rate: 0.0500.. Train loss: 2.5710.. Train acc: 0.3286.. Val loss: 2.6697.. Val acc: 0.3108
Epoch 8/300.. Learning rate: 0.0500.. Train loss: 2.4344.. Train acc: 0.3560.. Val loss: 2.6083.. Val acc: 0.3322
Epoch 9/300.. Learning rate: 0.0500.. Train loss: 2.2868.. Train acc: 0.3874.. Val loss:

5. Weight Decay
Add weight decay to the best learning rate you discovered, and the cosine learning rate schedule.
Other configurations should remain identical to the previous experiment. Experiment with two different
weight decay coefficients λ = 5 × 10−4 and 1 × 10−4, and illustrate their regularization effects using
training-curve diagrams. Report the final losses and accuracy values for both the training set and the
validation set. The network should be trained for 300 epochs.

In [None]:
epochQ5 = 300
#wd=0.0005, Cosine Annealing
main(Args(128,0.05,epochQ5,0.0005,True,"Accuracy and Loss with weight decay = 0.0005"))

#wd=0.0001, Cosine Annealing
main(Args(128,0.05,epochQ5,0.0001,True,"Accuracy and Loss with weight decay = 0.0001"))

Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, ep



Epoch 1/300.. Learning rate: 0.0500.. Train loss: 4.1763.. Train acc: 0.0612.. Val loss: 3.8822.. Val acc: 0.0880
Epoch 2/300.. Learning rate: 0.0500.. Train loss: 3.6643.. Train acc: 0.1257.. Val loss: 3.5245.. Val acc: 0.1479
Epoch 3/300.. Learning rate: 0.0500.. Train loss: 3.3642.. Train acc: 0.1752.. Val loss: 3.3454.. Val acc: 0.1820
Epoch 4/300.. Learning rate: 0.0500.. Train loss: 3.1196.. Train acc: 0.2224.. Val loss: 3.0821.. Val acc: 0.2320
Epoch 5/300.. Learning rate: 0.0500.. Train loss: 2.8845.. Train acc: 0.2624.. Val loss: 2.9797.. Val acc: 0.2566
Epoch 6/300.. Learning rate: 0.0500.. Train loss: 2.6881.. Train acc: 0.3018.. Val loss: 2.6950.. Val acc: 0.3013


6. Data Augmentation
With the best experimental setup you discovered so far, experiment with the mixup augmentation
technique [3]. Set the hyperparameter α to 0.2. Draw the probability density function associated with
the beta distribution parameterized by this α (5%). Train the network for 300 epochs.

In [8]:
def mainMixUp(args):
    # fix random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.use_deterministic_algorithms(True)

    # train val test
    # AI6103 students: You need to create the dataloaders youself
    train_loader, valid_loader = get_train_valid_loader(args.batch_size,args.seed)
    if args.test:
        test_loader = get_test_loader(args.batch_size)

    # model
    model = MobileNet(100)
    print(model)
    model.cuda()

    # criterion
    criterion = torch.nn.CrossEntropyLoss().cuda()

    # optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.wd)
    if args.lr_scheduler:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs)
    else:
        scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0, total_iters=args.epochs)

    stat_training_loss = []
    stat_val_loss = []
    stat_training_acc = []
    stat_val_acc = []
    for epoch in range(args.epochs):
        training_loss = 0
        training_acc = 0
        training_samples = 0
        val_loss = 0
        val_acc = 0
        val_samples = 0
        # training
        model.train()
        for imgs, labels in train_loader:
            imgs = imgs.cuda()
            labels = labels.cuda()
            batch_size = imgs.shape[0]
            optimizer.zero_grad()

            newImgs, ya, yb, lambdaMixUp = mixUpData(imgs, labels, 0.2)

            logits = model.forward(newImgs)
            loss = lambdaMixUp * criterion(logits, ya) + (1 - lambdaMixUp) * criterion(logits, yb)

            loss.backward()
            optimizer.step()
            _, top_class = logits.topk(1, dim=1)
            equals = top_class == labels.view(*top_class.shape)
            training_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            training_loss += batch_size * loss.item()
            training_samples += batch_size
        # validation
        model.eval()
        for val_imgs, val_labels in valid_loader:
            batch_size = val_imgs.shape[0]
            val_logits = model.forward(val_imgs.cuda())
            loss = criterion(val_logits, val_labels.cuda())
            _, top_class = val_logits.topk(1, dim=1)
            equals = top_class == val_labels.cuda().view(*top_class.shape)
            val_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            val_loss += batch_size * loss.item()
            val_samples += batch_size
        assert val_samples == 10000
        # update stats
        stat_training_loss.append(training_loss/training_samples)
        stat_val_loss.append(val_loss/val_samples)
        stat_training_acc.append(training_acc/training_samples)
        stat_val_acc.append(val_acc/val_samples)
        # print
        print(f"Epoch {(epoch+1):d}/{args.epochs:d}.. Learning rate: {scheduler.get_lr()[0]:.4f}.. Train loss: {(training_loss/training_samples):.4f}.. Train acc: {(training_acc/training_samples):.4f}.. Val loss: {(val_loss/val_samples):.4f}.. Val acc: {(val_acc/val_samples):.4f}")
        # lr scheduler
        scheduler.step()
    # plot
    plot_loss_acc(stat_training_loss, stat_val_loss, stat_training_acc, stat_val_acc, args.fig_name)
    # test
    if args.test:
        test_loss = 0
        test_acc = 0
        test_samples = 0
        for test_imgs, test_labels in test_loader:
            batch_size = test_imgs.shape[0]
            test_logits = model.forward(test_imgs.cuda())
            test_loss = criterion(test_logits, test_labels.cuda())
            _, top_class = test_logits.topk(1, dim=1)
            equals = top_class == test_labels.cuda().view(*top_class.shape)
            test_acc += torch.sum(equals.type(torch.FloatTensor)).item()
            test_loss += batch_size * test_loss.item()
            test_samples += batch_size
        assert test_samples == 10000
        print('Test loss: ', test_loss/test_samples)
        print('Test acc: ', test_acc/test_samples)

def mixUpData(input, y, alpha):
    if alpha > 0:
        lambdaMixUp = np.random.beta(alpha, alpha)
    else:
        lambdaMixUp = 1

    batch_size = input.size(0)
    index = torch.randperm(batch_size).cuda()

    newInput = lambdaMixUp * input + (1 - lambdaMixUp) * input[index, :]
    ya, yb = y, y[index]
    return newInput, ya,yb,lambdaMixUp


In [9]:
epochQ6=300
mainMixUp(Args(128,0.05,epochQ6,0.0005,True,"Accuracy and Loss with mix up of alpha = 0.2"))

Files already downloaded and verified
Files already downloaded and verified
MobileNet(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Block(
      (conv1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Block(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=64, bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn2): BatchNorm2d(128, ep



Epoch 1/300.. Learning rate: 0.0500.. Train loss: 4.2633.. Train acc: 0.0334.. Val loss: 3.8645.. Val acc: 0.0951
Epoch 2/300.. Learning rate: 0.0500.. Train loss: 3.8562.. Train acc: 0.0663.. Val loss: 3.5733.. Val acc: 0.1376
Epoch 3/300.. Learning rate: 0.0500.. Train loss: 3.6799.. Train acc: 0.0814.. Val loss: 3.4297.. Val acc: 0.1679
Epoch 4/300.. Learning rate: 0.0500.. Train loss: 3.4898.. Train acc: 0.1051.. Val loss: 3.3132.. Val acc: 0.2016
Epoch 5/300.. Learning rate: 0.0500.. Train loss: 3.3032.. Train acc: 0.1258.. Val loss: 3.0228.. Val acc: 0.2445
Epoch 6/300.. Learning rate: 0.0500.. Train loss: 3.1771.. Train acc: 0.1452.. Val loss: 2.8765.. Val acc: 0.2754
Epoch 7/300.. Learning rate: 0.0499.. Train loss: 2.9986.. Train acc: 0.1526.. Val loss: 2.7300.. Val acc: 0.2958
Epoch 8/300.. Learning rate: 0.0499.. Train loss: 2.9118.. Train acc: 0.1642.. Val loss: 2.6717.. Val acc: 0.3218
Epoch 9/300.. Learning rate: 0.0499.. Train loss: 2.7656.. Train acc: 0.1804.. Val loss:

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Test loss:  tensor(0.0017, device='cuda:0', grad_fn=<DivBackward0>)
Test acc:  0.7066


In [None]:
def ProbabilityDensityFunction(alpha):
  randomVariable = beta(alpha, alpha)
  observedValue = np.linspace(0, 1, 1000)
  plt.plot(observedValue, randomVariable.pdf(observedValue), 'r-', lw=2.6)
  plt.title(f'Beta Distribution with alpha=beta={alpha}')
  plt.xlabel('Lambda')
  plt.ylabel('Probability Density')
  plt.ylim(0, 3)
  plt.savefig('ProbabilityDensityFunction.png')
  files.download('ProbabilityDensityFunction.png')

# Plot the beta distribution
ProbabilityDensityFunction(0.2)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>