# Building a Convolutional Neural Network to classify Handwritten Digits
**Dataset:** [MNIST](https://en.wikipedia.org/wiki/MNIST_database)

## Imports
Import the required libraries

In [1]:
# PyTorch itself; everything in the library can be referenced through this module
import torch

# Import the neural network module separately
import torch.nn as nn

# Contains the required functions, such as ReLU (activation function)
import torch.nn.functional as F 

# Optimizers required to converge using backpropagation
import torch.optim as optim

# Datasets are used to download and load the data
# used to train and validate
# Transforms can be used to define modifications and/or augmentations methods
# to be applied while passing the data
# Normalization can also be one of the compositions of transforms
from torchvision import datasets, transforms

## Define the network architecture

In [2]:
class Net(nn.Module):
    ''' Fully convolutional classifier for 1-channel 28x28 images

    Seven 3x3 convolutions and two 2x2 max-pooling layers reduce a
    28x28x1 input to a 1x1x10 output, which is flattened and returned as
    log-probabilities over the 10 classes.
    Note: The class extends nn.Module, which is the base class for
    Neural Network modules in PyTorch https://pytorch.org/docs/stable/generated/torch.nn.Module.html
    '''

    def __init__(self):
        ''' Initialize the base class and define the layers that
        constitute the network '''

        # Start by initializing the base class
        super().__init__()

        # Define the layers that make up the network
        # i.e. the Network Architecture
        # nn.Conv2d - Used to perform 2-dimensional convolution using the defined size of the kernel
        # nn.MaxPool2d - 2d MaxPooling Layer of the defined size

        # MNIST contains images of size 28x28
        # The padded convolutions (padding=1 with a 3x3 kernel) keep the
        # spatial size unchanged; the unpadded ones shrink it by 2.
        #
        # Receptive field (RF) is computed as
        #     RF_out = RF_in + (kernel - 1) * jump_in
        # where jump_in is the product of the strides of all earlier layers
        # (1 before pool1, 2 after pool1, 4 after pool2).
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1) # Input: 28x28x1; Output: 28x28x32; RF: 3x3
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1) # Input: 28x28x32; Output: 28x28x64; RF: 5x5
        self.pool1 = nn.MaxPool2d(2, 2) # Input: 28x28x64; Output: 14x14x64; RF: 6x6
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1) # Input: 14x14x64; Output: 14x14x128; RF: 10x10
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1) # Input: 14x14x128; Output: 14x14x256; RF: 14x14
        self.pool2 = nn.MaxPool2d(2, 2) # Input: 14x14x256; Output: 7x7x256; RF: 16x16
        self.conv5 = nn.Conv2d(256, 512, 3) # Input: 7x7x256; Output: 5x5x512; RF: 24x24
        # From conv6 onwards the receptive field (32x32) is at least as
        # large as the 28x28 input image
        self.conv6 = nn.Conv2d(512, 1024, 3) # Input: 5x5x512; Output: 3x3x1024; RF: 32x32
        self.conv7 = nn.Conv2d(1024, 10, 3) # Input: 3x3x1024; Output: 1x1x10; RF: 40x40


    def forward(self, x):
        ''' Define the forward pass
        Every convolution except the last is followed by ReLU to add
        non-linearity.

        x: batch of images; the layer sizes above assume 1x28x28 per image
        Returns a 2-d tensor with 10 columns holding the log-softmax
        (log-probabilities) of each row.
        '''
        # Two convolutions, each followed by ReLU, and then a pooling layer
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))

        # Two convolutions, each followed by ReLU, and then a pooling layer
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))

        # Two convolutions, each followed by ReLU
        x = F.relu(self.conv6(F.relu(self.conv5(x))))

        # The final layer must NOT be passed through ReLU: its raw output
        # is retained as is, so that negative scores survive until the
        # (log) softmax is computed
        x = self.conv7(x)

        # Collapse the 1x1 spatial dimensions: one row of 10 scores per image
        x = x.view(-1, 10)

        # Normalize across the class dimension. dim is given explicitly
        # because relying on the implicit dimension choice is deprecated
        # and raises a warning.
        return F.log_softmax(x, dim=1)

In [3]:
# To display an overview/summary of the network
# Is also useful to validate whether the structure of the network is 
# correct, i.e. the input channels and the output channels and its flow
!pip install torchsummary
from torchsummary import summary

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network, move it to the chosen device and print a
# layer-by-layer summary for a single 1x28x28 input
model = Net()
model = model.to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
Total params: 6,379,786
Trainable params: 6,379,786
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estimated Total Size (MB): 25.85
-------------------------------------



## Load the data

In [4]:
# Fix the seed of PyTorch's random generator
# so that the results are reproducible
torch.manual_seed(1)

# Number of images per batch
batch_size = 128

# Extra DataLoader options, only supplied when running on the GPU
kwargs = dict(num_workers=1, pin_memory=True) if use_cuda else dict()

# Preprocessing shared by the training and the test set:
# convert each image to a tensor, then normalize it with
# mean 0.1307 and standard deviation 0.3081
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training set: downloaded into ../data if required, served in shuffled batches
train_set = datasets.MNIST('../data', train=True, download=True,
                           transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, **kwargs)

# Test set: read from the same ../data directory, preprocessed the same way
test_set = datasets.MNIST('../data', train=False, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=batch_size, shuffle=True, **kwargs)

## Train and Validate

In [5]:
# TQDM is just awesome... provides a progress status bar as the training 
# (or any operation) proceeds
from tqdm import tqdm


def train(model, device, train_loader, optimizer, epoch):
    ''' Run one pass of optimizer updates over train_loader

    model: network being trained, called as model(data)
    device: device every batch is moved to before the forward pass
    train_loader: iterable yielding (data, target) batches
    optimizer: optimizer whose zero_grad() and step() drive the update
    epoch: accepted from the caller but not read inside this function
    '''

    # Switch the model to training mode
    model.train()

    # Wrap the loader so that progress is shown while iterating
    pbar = tqdm(train_loader)

    for batch_idx, (inputs, labels) in enumerate(pbar):
        # Move the batch to the device the model is being run on
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear the gradients accumulated by the previous iteration
        optimizer.zero_grad()

        # Forward pass, followed by the negative log-likelihood loss of
        # the predictions against the labels
        loss = F.nll_loss(model(inputs), labels)

        # Backpropagate the loss, then let the optimizer update the weights
        loss.backward()
        optimizer.step()

        # Show the latest loss and batch index on the progress bar
        pbar.set_description(desc=f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    ''' Evaluate the model on a hold-out loader and print the results

    model: network to evaluate, called as model(data)
    device: device every batch is moved to before the forward pass
    test_loader: iterable yielding (data, target) batches; it must also
        expose `.dataset`, whose length is used as the number of samples
    Prints the average loss and the accuracy; nothing is returned.
    '''

    # Switch the model to evaluation mode
    model.eval()

    # Running totals over the whole set
    loss_sum = 0
    num_correct = 0

    # Only the forward pass is needed here, so gradient tracking is disabled
    with torch.no_grad():
        for data, target in test_loader:
            # Move the batch to the device the model is being run on
            data = data.to(device)
            target = target.to(device)

            output = model(data)

            # Add up the (un-averaged) loss of this batch
            loss_sum += F.nll_loss(output, target, reduction='sum').item()

            # The predicted class is the index of the largest log-probability
            pred = output.argmax(dim=1, keepdim=True)

            # Count the predictions that match the target
            num_correct += (pred == target.view_as(pred)).sum().item()

    # Average the summed loss over every sample in the set
    num_samples = len(test_loader.dataset)
    avg_loss = loss_sum / num_samples

    # Display the results
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, num_correct, num_samples,
        100. * num_correct / num_samples))

In [6]:
# Start from a freshly initialized network on the selected device
model = Net().to(device)

# Optimizer used to perform gradient descent:
# Stochastic Gradient Descent (SGD) with a momentum of 0.9
# and the learning rate (alpha) set to 0.01
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Train for 2 epochs, validating on the test set after each one
num_epochs = 2
for epoch in range(num_epochs):
    print(f'Iteration {epoch+1}')

    # Training phase
    train(model, device, train_loader, optimizer, epoch)

    # Validation phase on the test/validation set
    test(model, device, test_loader)

    print('\n\n\n')

  0%|          | 0/469 [00:00<?, ?it/s]

Iteration 1


loss=0.03500483185052872 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.62it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


Test set: Average loss: 0.0704, Accuracy: 9771/10000 (98%)





Iteration 2


loss=0.07858671247959137 batch_id=468: 100%|██████████| 469/469 [00:21<00:00, 21.47it/s]



Test set: Average loss: 0.0329, Accuracy: 9883/10000 (99%)





