In [1]:
import argparse
import os, sys
import time
import datetime
import numpy as np

# Import pytorch dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm

# You cannot change this line.
from tools.dataloader import CIFAR10

torch.cuda.is_available()

True

In [2]:
""" 
Assignment 2(a)
Build the LeNet-5 model by following table 1 or figure 1.

You can also insert batch normalization and leave the LeNet-5 
with batch normalization here for assignment 3(c).
"""
class flatten(nn.Module):
    def forward(self, x):
        return x.view(x.shape[0],-1)

# Create the neural network module: LeNet-5
class LeNet5(nn.Module):
    def __init__(self):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, kernel_size=5, stride=1, padding=0)
        self.pool1 = nn.MaxPool2d(2)
        self.conv1_bn = nn.BatchNorm2d(6)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.pool2 = nn.MaxPool2d(2)
        self.conv2_bn = nn.BatchNorm2d(16)
        
        self.flat  = flatten()
        
        self.fc1 = nn.Linear(5*5*16, 120)
        self.fc1_bn = nn.BatchNorm1d(120)
        self.fc2 = nn.Linear(120, 84)
        self.fc2_bn = nn.BatchNorm1d(84)
        self.fc3 = nn.Linear(84, 10)
        
        nn.init.xavier_normal_(self.conv1.weight)
        nn.init.xavier_normal_(self.conv2.weight)
        nn.init.xavier_normal_(self.fc1.weight)
        nn.init.xavier_normal_(self.fc2.weight)
        nn.init.xavier_normal_(self.fc3.weight)

    def forward(self, x):
        conv_pool1 = self.pool1(self.conv1_bn(F.relu(self.conv1(x))))
        conv_pool2 = self.pool2(self.conv2_bn(F.relu(self.conv2(conv_pool1))))
        
        return self.fc3(self.fc2_bn(F.relu(self.fc2(self.fc1_bn(F.relu(self.fc1(self.flat(conv_pool2))))))))

In [3]:
## test
net = LeNet5()
x = torch.zeros([128,3,32,32])
print(net(x).shape)

torch.Size([128, 10])


In [4]:
"""
Hyperparameter optimization in assignment 4(a), 4(b) can be 
conducted here.
Be sure to leave only your best hyperparameter combination
here and comment the original hyperparameter settings.
"""

# Setting some hyperparameters
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100
INITIAL_LR = 0.04
MOMENTUM = 0.9
REG = 1e-4
EPOCHS = 100
DATAROOT = "./data"
CHECKPOINT_PATH = "./saved_model"

**Your answer:**

We test batch size = 64, 128, 256, with 128 having the best performance. Learning rate is an important hyperparameter, we test lr = 0.005, 0.01, 0.02, 0.04, 0.08 where 0.04 has the best performance. For momentum, we tried 0.7, 0.8, 0.9. For REG, we tried 0.5e-4, 1e-4, 2e-4.

In [5]:
"""
Assignment 2(b)
Write functions to load dataset and preprocess the incoming data. 
We recommend that the preprocess scheme \textbf{must} include 
normalize, standardization, batch shuffling to make sure the training 
process goes smoothly. 
The preprocess scheme may also contain some data augmentation methods 
(e.g., random crop, random flip, etc.). 

Reference value for mean/std:

mean(RGB-format): (0.4914, 0.4822, 0.4465)
std(RGB-format): (0.2023, 0.1994, 0.2010)


NOTE: Considering this process has strong corrlelation with assignment 3(b), 
please leave the data preprocessing method which can achieve the highest 
validation accuracy here. You can include your original data augmentation
method as comments and denotes the accuracy difference between thest two 
methods.
"""
# Specify preprocessing function.
# Reference mean/std value for 
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])




**Your answer:**

Originally we only used transforms.ToTensor() and transforms.Normalize. After that we try to apply transform.RandomCrop and RandomHorizontalFlip() for the training set. In order to keep the size after randomcrop the same as original size, we add some padding (padding = 4). 

In [6]:
# Call the dataset Loader
trainset = CIFAR10(root=DATAROOT, train=True, download=False, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=1)
valset = CIFAR10(root=DATAROOT, train=False, download=False, transform=transform_val)
valloader = torch.utils.data.DataLoader(valset, batch_size=VAL_BATCH_SIZE, shuffle=False, num_workers=1)

In [7]:
# Specify the device for computation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = LeNet5()
net = net.to(device)
if device =='cuda':
    print("Train on GPU...")
else:
    print("Train on CPU...")

Train on GPU...


In [8]:
# FLAG for loading the pretrained model
TRAIN_FROM_SCRATCH = False
# Code for loading checkpoint and recover epoch id.
CKPT_PATH = "./saved_model/model.h5"
def get_checkpoint(ckpt_path):
    try:
        ckpt = torch.load(ckpt_path)
    except Exception as e:
        print(e)
        return None
    return ckpt

ckpt = get_checkpoint(CKPT_PATH)
if ckpt is None or TRAIN_FROM_SCRATCH:
    if not TRAIN_FROM_SCRATCH:
        print("Checkpoint not found.")
    print("Training from scratch ...")
    start_epoch = 0
    current_learning_rate = INITIAL_LR
else:
    print("Successfully loaded checkpoint: %s" %CKPT_PATH)
    net.load_state_dict(ckpt['net'])
    start_epoch = ckpt['epoch'] + 1
    current_learning_rate = ckpt['lr']
    print("Starting from epoch %d " %start_epoch)

print("Starting from learning rate %f:" %current_learning_rate)

Successfully loaded checkpoint: ./saved_model/model.h5
Starting from epoch 6 
Starting from learning rate 0.036100:


In [9]:
"""
Assignment 2(c)
In the targeted classification task, we use cross entropy loss with L2 
regularization as the learning object.
You need to formulate the cross-entropy loss function in PyTorch.
You should also specify a PyTorch Optimizer to optimize this loss function.
We recommend you to use the SGD-momentum with an initial learning rate 0.01 
and momentum 0.9 as a start.
"""
# Create loss function and specify regularization
criterion = nn.CrossEntropyLoss()
# Add optimizer
optimizer = optim.SGD(net.parameters(), lr=INITIAL_LR, momentum=MOMENTUM)

In [10]:
"""
Assignment 3(a)
Start the training process over the whole CIFAR-10 training dataset. 
For sanity check, you are required to report the initial loss value at 
the beginning of the training process and briefly justify this value. 
Run the training process for \textbf{a maximum of 30} epochs and you 
should be able to reach around \textbf{65\%} accuracy on the validation 
dataset.
"""
# Start the training/validation process
# The process should take about 5 minutes on a GTX 1070-Ti
# if the code is written efficiently.
global_step = 0
best_val_acc = 0

train_acc_recorder = []
train_loss_recorder = []
val_acc_recorder = []
val_loss_recorder = []

## calculating original loss
net.train()
for batch_idx, (inputs, targets) in enumerate(trainloader):
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = net(inputs)
    loss = criterion(outputs, targets)
    print(loss)
    if batch_idx>0:
        break

for i in range(start_epoch, EPOCHS):
    print(datetime.datetime.now())
    # Switch to train mode
    net.train()
    print("Epoch %d:" %i)

    total_examples = 0
    correct_examples = 0

    train_loss = 0
    train_acc = 0
    # Train the training dataset for 1 epoch.
    print(len(trainloader))
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # Copy inputs to device
        inputs = inputs.to(device)
        targets = targets.to(device)
        
        # Zero the gradient
        optimizer.zero_grad()
        # Generate output
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        for param in net.parameters():
            loss += REG * (param**2).sum()
        # Now backward loss
        loss.backward()
        # Apply gradient
        optimizer.step()
        # Calculate predicted labels
        _, predicted = torch.max(outputs, 1)
        # Calculate accuracy
        total_examples += len(predicted)
        correct_examples += (predicted==targets).sum().item()
        train_loss += loss

        global_step += 1
        if global_step % 100 == 0:
            avg_loss = train_loss / (batch_idx + 1)
        pass
    avg_acc = correct_examples / total_examples
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))
    
    train_acc_recorder.append(avg_acc)
    train_loss_recorder.append(avg_loss)
    
    print(datetime.datetime.now())
    # Validate on the validation dataset
    print("Validation...")
    total_examples = 0
    correct_examples = 0
    
    net.eval()

    val_loss = 0
    val_acc = 0
    # Disable gradient during validation
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(valloader):
            # Copy inputs to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Zero the gradient
            optimizer.zero_grad()
            # Generate output from the DNN.
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            # Calculate predicted labels
            _, predicted = outputs.max(1)
            # Calculate accuracy
            total_examples += len(predicted)
            correct_examples += (predicted==targets).sum().item()
            val_loss += loss

    avg_loss = val_loss / len(valloader)
    avg_acc = correct_examples / total_examples
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))
    
    val_acc_recorder.append(avg_acc)
    val_loss_recorder.append(avg_loss)
        
    """
    Assignment 4(b)
    Learning rate is an important hyperparameter to tune. Specify a 
    learning rate decay policy and apply it in your training process. 
    Briefly describe its impact on the learning curveduring your 
    training process.    
    Reference learning rate schedule: 
    decay 0.98 for every 2 epochs. You may tune this parameter but 
    minimal gain will be achieved.
    Assignment 4(c)
    As we can see from above, hyperparameter optimization is critical 
    to obtain a good performance of DNN models. Try to fine-tune the 
    model to over 70% accuracy. You may also increase the number of 
    epochs to up to 100 during the process. Briefly describe what you 
    have tried to improve the performance of the LeNet-5 model.
    """
    DECAY_EPOCHS = 2
    DECAY = 0.95
    if i % DECAY_EPOCHS == 0 and i != 0:
        current_learning_rate = current_learning_rate*DECAY
        for param_group in optimizer.param_groups:
            param_group['lr'] = current_learning_rate
            # Assign the learning rate parameter
            
        print("Current learning rate has decayed to %f" %current_learning_rate)
    
    # Save for checkpoint
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        if not os.path.exists(CHECKPOINT_PATH):
            os.makedirs(CHECKPOINT_PATH)
        print("Saving ...")
        state = {'net': net.state_dict(),
                 'epoch': i,
                 'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_PATH, 'model.h5'))

print("Optimization finished.")


tensor(1.2089, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.0819, device='cuda:0', grad_fn=<NllLossBackward>)
2019-09-18 16:08:19.146292
Epoch 6:
352
Training loss: 1.1899, Training accuracy: 0.5979
2019-09-18 16:08:34.277436
Validation...
Validation loss: 1.0575, Validation accuracy: 0.6184
Current learning rate has decayed to 0.034295
Saving ...
2019-09-18 16:08:35.510614
Epoch 7:
352
Training loss: 1.1592, Training accuracy: 0.6108
2019-09-18 16:08:48.757422
Validation...
Validation loss: 1.0092, Validation accuracy: 0.6328
Saving ...
2019-09-18 16:08:49.998400
Epoch 8:
352
Training loss: 1.1403, Training accuracy: 0.6178
2019-09-18 16:09:03.165051
Validation...
Validation loss: 1.0268, Validation accuracy: 0.6320
Current learning rate has decayed to 0.032580
2019-09-18 16:09:04.363416
Epoch 9:
352
Training loss: 1.1222, Training accuracy: 0.6253
2019-09-18 16:09:18.291413
Validation...
Validation loss: 0.9832, Validation accuracy: 0.6476
Saving ...
2019-09-18 16:09:19.49406

Most answers are in the report file.

Note that it's interesting that without Xavier normalization, the original loss is around log10 = 2.3. However, when I add Xavier normalization, the original loss grows larger to around 3.