# Handwritten Digit classification using PyTorch

### Install PyTorch

PyTorch is not installed in this environment by default, so you have to install it again every time you start a new session. 

Run the cell below to install PyTorch.

In [0]:
# Install the PyTorch 0.3.0.post4 wheel (CUDA 8.0 build for CPython 3.6) that this lab is written against.
# The wheel is fetched over HTTPS rather than plain HTTP.
!pip3 install https://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
# Pin torchvision too: an unpinned install resolves to the newest release, which can
# pull in a newer torch and replace the 0.3.0.post4 build installed above.
# NOTE(review): 0.2.0 is the torchvision release contemporary with torch 0.3.0 - confirm on the lab image.
!pip3 install torchvision==0.2.0

### Import PyTorch library 

- Import PyTorch by using the **```import torch```** command.

- You can check the version of PyTorch by using **```torch.__version__```** command.

- Use **```torch.cuda.is_available()```** to determine whether your environment supports a GPU; it should return **```True```**. 

- If **```torch.cuda.is_available()```** returns **```False```**, you should change the accelerator from **None** to **GPU** (Refer to lab sheet section 3.1.7). 

In [0]:
import torch

# Expected output in this lab's environment, one value per line:
#   0.3.0.post4   <- the installed PyTorch version
#   True          <- printed when a GPU is available
print(torch.__version__, torch.cuda.is_available(), sep='\n')

Import other necessary libraries

In [0]:
from torch import nn, optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
import numpy as np

import time

Define the hyperparameters of the neural network.

**You will need to modify some of the training parameters for answering the questions.**

In [0]:
# Seed the random number generators used in this notebook (PyTorch on CPU,
# PyTorch on all CUDA devices, and NumPy) with the same fixed value.
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)

# Training parameters
batch_size = 32 # Number of images per batch produced by the data loaders
learning_rate = 0.1 # Learning rate (lr) given to the SGD optimizers
num_epochs = 30 # Number of train + test passes run by the training loop
weight_decay_coef = 0 # Weight decay given to the SGD optimizers. Modify this for Q.4

## MNIST handwritten digit dataset

Create the data loaders: one for **training** and one for **evaluation**.

In [0]:
# Both splits convert each image to a tensor with the same transform
to_tensor = transforms.ToTensor()

# Training split (downloaded into ./data when requested here)
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=to_tensor,
    download=True,
)
# Test split, read from the same ./data folder
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    transform=to_tensor,
)

# Only the training batches are shuffled; the evaluation order stays fixed
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    num_workers=1,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    num_workers=1,
    shuffle=False,
)

### Visualizing the dataset

In [0]:
import matplotlib.pyplot as plt

# Show the first rows*columns images of the dataset on a rows x columns grid.
columns = 4
rows = 5
fig = plt.figure()
for i in range(1, columns*rows+1):
    # Try to change: train_dataset -> test_dataset, and run it again
    # Index the dataset directly; [0] selects the image of the sample and the
    # second [0] selects its first channel, which is converted to a NumPy array.
    img = train_dataset[i-1][0][0].numpy()

    # Draw on the subplot's own axes instead of relying on pyplot's implicit "current axes"
    ax = fig.add_subplot(rows, columns, i)
    ax.axis('off')
    ax.imshow(img, cmap=plt.get_cmap('gray'))
plt.show()

### Build a 3-layer Multi-Layer Perceptron (MLP)

![MLP](https://www.pyimagesearch.com/wp-content/uploads/2016/08/simple_neural_network_header-768x377.jpg =x200)

It is called "3-layer" because it has 3 sets of trainable parameters (i.e. 3 sets of weights and biases).


In [0]:
class MLP(nn.Module):
    """Multi-layer perceptron made of three linear layers applied one after another."""

    def __init__(self, in_dim, h1_dim, h2_dim, out_dim):
        """Create the three linear layers.

        in_dim: number of values in each flattened input sample
        h1_dim: output size of layer1 (input size of layer2)
        h2_dim: output size of layer2 (input size of layer3)
        out_dim: number of output scores per sample
        """
        super().__init__()
        self.layer1 = nn.Linear(in_features=in_dim, out_features=h1_dim, bias=True)
        self.layer2 = nn.Linear(in_features=h1_dim, out_features=h2_dim, bias=True)
        self.layer3 = nn.Linear(in_features=h2_dim, out_features=out_dim)

    def forward(self, x):
        """Flatten each sample of the batch and pass it through the three layers."""
        # Keep the batch axis and flatten the rest, e.g. batch_size*28*28 -> batch_size*784
        num_samples = x.size(0)
        out = x.view(num_samples, -1)

        out = self.layer1(out)
        #out = F.relu(out) # Uncomment it for adding ReLU activation. Modify it for Q.3
        out = self.layer2(out)
        #out = F.relu(out) # Uncomment it for adding ReLU activation. Modify it for Q.3
        return self.layer3(out)

### Build a Convolutional Neural Network (CNN)

![CNN](https://raw.github.com/floydhub/mnist/master/images/mnist_convet.png)

In [0]:
class CNN(nn.Module):
    """Convolutional network: two conv -> max-pool -> ReLU stages followed by two linear layers."""

    def __init__(self):
        """Create the two convolution layers and the two fully connected layers."""
        super().__init__()
        # 5x5 convolutions: 1 -> 10 channels, then 10 -> 20 channels
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        # Fully connected part: 320 flattened values -> 50 -> 10 output scores
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)

    def forward(self, x):
        """Return the 10 output scores of fc2 for every sample in the batch."""
        # Stage 1: convolution, 2x2 max-pooling, ReLU
        features = F.relu(F.max_pool2d(self.conv1(x), 2))

        # Stage 2: same sequence with the second convolution
        features = F.relu(F.max_pool2d(self.conv2(features), 2))

        # Flatten into rows of 320 values for the fully connected layers
        flat = features.view(-1, 320)
        hidden = F.relu(self.fc1(flat))

        # No activation after the last layer
        return self.fc2(hidden)

Create the models of the MLP and the CNN.

In [0]:
# Build the MLP with the following hyperparameters:
#   in_dim=28*28, h1_dim=128, h2_dim=128, out_dim=10
# .cuda() converts all of the model's trainable parameters to CUDA Tensors
# so that the model can be trained with the GPU
model_mlp = MLP(in_dim=28*28, h1_dim=128, h2_dim=128, out_dim=10).cuda()

#-------------------------------------------------------
# Build the CNN and convert it for GPU training in the same way
model_cnn = CNN().cuda()

We can display the details of the model.

In [0]:
# Display the details of both models: the MLP first, then the CNN
print(model_mlp, model_cnn, sep='\n')

Also, we can list the number of trainable parameters in each layer.

**Show the number of trainable parameters in layer1 of the MLP and conv1 of the CNN**

In [0]:
# Print the shape and the element count of every parameter of layer1
for param in model_mlp.layer1.parameters(): # Modify it for Q.1
    shape, count = param.shape, param.numel()
    print('Shape of parameters: ', shape)
    print('Number of parameters: ', count)

In [0]:
# Print the shape and the element count of every parameter of conv1.
# The labels use the same capitalisation as the MLP cell so the two outputs line up.
for param in model_cnn.conv1.parameters(): # Modify it for Q.1
    print('Shape of parameters: ', param.shape)
    print('Number of parameters: ', param.numel())

Create a loss function and an optimizer.

In [0]:
# nn.CrossEntropyLoss includes the softmax operation itself
# Refer to: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
CELoss = nn.CrossEntropyLoss()

# Stochastic gradient descent is used as the optimizer.
# Each model gets its own optimizer; both are configured with the same settings
sgd_settings = dict(lr=learning_rate, weight_decay=weight_decay_coef)
SGD_mlp = optim.SGD(model_mlp.parameters(), **sgd_settings)
SGD_cnn = optim.SGD(model_cnn.parameters(), **sgd_settings)

## Create the functions to train and evaluate the model

In Python, **```def```** is used to define a function. We create **```train```** and **```test```** to train the model and evaluate the performance. 

In [0]:
def train(loader, net, criterion, optimizer):
    """Train `net` for one pass over `loader` and print the loss and accuracy.

    Args:
        loader: iterable yielding (img, label) batches.
        net: the model to train. Batches are moved to the GPU only when the
            model's parameters are on the GPU.
        criterion: loss function called as criterion(output_score, label). It is
            assumed to return the MEAN loss over the batch (the default
            behaviour of nn.CrossEntropyLoss).
        optimizer: optimizer used to update the parameters after each batch.

    Returns:
        (average_loss, average_accuracy): loss per sample and top-1 accuracy
        over the whole pass. The same values are printed.
    """
    start_time = time.time()

    def to_number(value):
        # Convert a one-element result to a Python number: `.item()` where it
        # exists, otherwise `.data[0]` (the form this lab was written with).
        return value.item() if hasattr(value, 'item') else value.data[0]

    # Follow the model: only move the data to the GPU if the model lives there
    use_cuda = any(p.data.is_cuda for p in net.parameters())

    # Make train model of the net is turned on
    net.train()

    running_loss = 0.0
    running_samples = 0.0
    running_correct = 0.0

    for data in loader:
        img, label = data

        batch_samples = img.size(0)
        running_samples += batch_samples

        # Wrap as Variable (and convert to CUDA Tensor when needed)
        # as we can only calculate the gradient of a variable
        img = Variable(img)
        label = Variable(label)
        if use_cuda:
            img = img.cuda()
            label = label.cuda()

        # Feed forward
        output_score = net(img) # dim. of output_score: batch_size*10

        # Calculate the loss
        loss = criterion(output_score, label)

        # Backpropagation (3 steps)
        # 1. Clear the gradients (i.e. all become zero)
        # 2. Calculate the derivative of the loss with respect to the variables(trainable parameters)
        # 3. Update the parameters (i.e. model.parameters(), as defined above) by using the optimizer
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, pred = torch.max(output_score, 1)
        # Cast to long before summing: the comparison result is a narrow
        # (uint8/bool) type, and summing it as uint8 can wrap around at 256
        # once a batch holds more than 255 samples.
        num_correct = to_number((pred == label).long().sum())

        # The criterion returns the batch mean, so weight it by the batch size;
        # dividing by the sample count below then gives a true per-sample
        # average even when the last batch is smaller.
        running_loss += to_number(loss) * batch_samples
        running_correct += num_correct

    # Calculate the average loss and top-1 accuracy for each epoch
    average_loss = running_loss / running_samples
    average_accuracy = running_correct / running_samples

    print('Training loss: {:.4f}, Acc: {:.4f} in {:.4f}'.format(
          average_loss, 
          average_accuracy,
          time.time()-start_time))

    return average_loss, average_accuracy
 
#---------------------------------------------------------------------------------

def test(loader, net, criterion):
    #In testing phase, no need to update the network.
    start_time = time.time()

    # Make eval model of the net is turned on
    net.eval()

    running_loss = 0.0
    running_samples = 0.0
    running_correct = 0.0

    for data in loader:
        img, label = data

        running_samples += img.size(0)

        # Convert to CUDA Tensor variable
        # as we can only calculate the gradient of an variable
        img = Variable(img).cuda()
        label = Variable(label).cuda()

        # Feed forward
        output_score = net(img) # dim. of output_score: batch_size*10

        # Calculate the loss
        loss = criterion(output_score, label)

        _, pred = torch.max(output_score, 1)
        num_correct = (pred == label).sum().data[0]

        running_loss += loss.data[0]
        running_correct += num_correct

    # Calculate the average loss and top-1 accuracy for each epoch
    average_loss = running_loss / running_samples
    average_accuracy = running_correct / running_samples

    print('Test loss: {:.4f}, Acc: {:.4f} in {:.4f}'.format(
          average_loss, 
          average_accuracy,
          time.time()-start_time))

In [0]:
# Train and test the MLP/CNN for num_epochs epochs.
# To train and test with the CNN, change ONLY the next line to:
#     model, optimizer = model_cnn, SGD_cnn
# (Modify it for Q.2 and Q.4.) Choosing the pair in one place keeps the model that is
# trained, the optimizer that updates it and the model that is evaluated in sync;
# pairing a model with the other model's optimizer would leave its weights unchanged.
model, optimizer = model_mlp, SGD_mlp
for epoch in range(1, num_epochs+1):
    print('Epoch: ', epoch)
    train(train_loader, model, criterion=CELoss, optimizer=optimizer)
    test(test_loader, model, criterion=CELoss)

In [0]:
print('Display the value of the trainable parameters in each layer:')
# One line per parameter: mean, minimum and maximum of its values
stats_line = 'Mean: {:.4f} Min.: {:.4f} Max.:{:.4f}'
for param in model_mlp.parameters(): # Modify for Q.4 (For CNN, change model_mlp->model_cnn)
    values = param.data
    print(stats_line.format(values.mean(), values.min(), values.max()))