<a href="https://colab.research.google.com/github/JosephKJ/DL-Tutorial/blob/master/2_Basic_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's BackProp!

We will train an MLP with one hidden layer.

This tutorial has two parts: 

* Implementing Back-propagation from scratch 
* Using the in-built 'Autograd' module to train the MLP network.


## Import all the required packages

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np

## Initialize the variables


In [0]:
# Number of samples per mini-batch.
batch_size = 32
# Input dimension: each MNIST image is 28 x 28 pixels, flattened to 784 values.
input_dim = 28 * 28
# Number of nodes in the single hidden layer.
num_of_hidden_nodes = 100
# Number of output nodes = number of classes in the dataset (10 for MNIST).
output_dim = 10

# Step size used for the gradient-descent updates.
learning_rate = 0.1
# Number of passes over the training data.
num_epochs = 5

## Load the MNIST data. 
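The MNIST training set is stored in the './data' folder. Because the argument download is set to 'True', it is fetched automatically the first time this cell runs and reused from disk afterwards. Each image is converted to a tensor and then standardized using the dataset mean (0.1307) and standard deviation (0.3081).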

In [0]:
# Pre-processing applied to every image: convert it to a tensor, then
# normalize it with the mean / std constants given below.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split of MNIST, stored under './data' (downloaded if not present).
mnist_dataset = datasets.MNIST(
    './data',
    train=True,
    download=True,
    transform=mnist_transform,
)

# Serves shuffled mini-batches of `batch_size` samples.
train_loader = torch.utils.data.DataLoader(
    mnist_dataset,
    batch_size=batch_size,
    shuffle=True,
)

## Sigmoid activation function and its derivative

$\sigma(x)=\frac{1}{1+e^{-x}}$

$\sigma^{'}(x) = \sigma(x)(1-\sigma(x))$

In [0]:
def sigmoid(x):
    """Apply the logistic function 1 / (1 + exp(-x)) element-wise.

    Args:
        x: Input tensor.

    Returns:
        A tensor with the same shape as ``x``.
    """
    denominator = torch.exp(-x) + 1
    return 1 / denominator
    

def sigmoid_diff(x):
    """Derivative of the sigmoid, evaluated element-wise at ``x``.

    Computes sigmoid(x) * (1 - sigmoid(x)). Note that ``x`` is the value the
    sigmoid is applied to (the pre-activation), not a sigmoid output.

    Args:
        x: Input tensor.

    Returns:
        A tensor with the same shape as ``x``.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)


In [0]:
# Helper Class and Functions
class Metrics:
    """Keeps the latest value and a weighted running average of a metric."""

    def __init__(self):
        # val: most recent value, sum: weighted total of all values,
        # count: total weight seen so far, avg: sum / count.
        self.val, self.sum, self.count, self.avg = 0, 0, 0, 0

    def update(self, val, n=1):
        """Record a new value.

        Args:
            val: The new metric value.
            n: Weight of this value (e.g. the number of samples it covers).
        """
        weighted_value = val * n
        self.val = val
        self.count += n
        self.sum += weighted_value
        self.avg = self.sum / self.count
        
        
def compute_accuracy(output, target, topk=(1,)):
    """Compute the top-k accuracy (in percent) for each k in ``topk``.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: 1-D tensor holding the true class index of each sample.
        topk: Iterable of k values; a sample counts as correct for a given k
            if its true class is among the k highest-scoring classes.

    Returns:
        A list of 0-dim float tensors, one per entry of ``topk``, each holding
        the percentage of samples in the batch classified correctly.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk highest scores per sample, transposed so that row i
    # holds every sample's (i+1)-th best guess.
    _, pred = output.topk(maxk, 1)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    result = []
    for k in topk:
        # `correct` inherits the transposed (non-contiguous) layout of `pred`,
        # so `view(-1)` raises for k > 1; `reshape(-1)` copies when necessary.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        result.append(correct_k.mul_(100.0 / batch_size))
    return result


def one_hot(data, max_value):
    """Convert a tensor of class indices into one-hot encoded rows.

    Args:
        data: 1-D tensor of integer class indices, each in [0, max_value).
        max_value: Number of classes, i.e. the length of each one-hot row.

    Returns:
        A float tensor with one row per entry of ``data`` and ``max_value``
        columns, where row i is 1 at column data[i] and 0 elsewhere.
    """
    # Row i of the identity matrix is the one-hot vector for class i. Build it
    # on the same device as the labels so index_select also works off the CPU.
    identity = torch.eye(max_value, device=data.device)
    return identity.index_select(0, data)

## Initialize the weight matrices with some random values

$W_1 \in \mathbb{R}^{784 \times 100}$

$W_2 \in \mathbb{R}^{100 \times 10}$

In [0]:
# Initialize the weights with random normal values (32-bit floats)
# W_1: weights between the input and the hidden layer
W_1 = torch.randn(input_dim, num_of_hidden_nodes, dtype=torch.float32)
# W_2: weights between the hidden layer and the output
W_2 = torch.randn(num_of_hidden_nodes, output_dim, dtype=torch.float32)

## The training loop with manual backpropagation

In each epoch, we will have several batches of data. We take each of the batches and do the forward pass. Then based on the error we back-propagate.

![alt text](https://www.researchgate.net/profile/Junita_Mohamad-Saleh/publication/257071174/figure/fig3/AS:297526545666050@1447947264431/A-schematic-diagram-of-a-Multi-Layer-Perceptron-MLP-neural-network.png "MLP with 3-layers")


Assume batch_size = 1, so the input $X$ is a row vector in $\mathbb{R}^{1 \times 784}$. Below, $*$ denotes matrix multiplication and $.$ denotes element-wise multiplication.

### Mean-Squared Loss Function:

$L = 0.5*\sum_{i}(output_i - true\_output_i)^2$ (the squared error is summed over the output nodes)

### Forward Pass:

$Z = \sigma(X*W_1)$           [$\mathbb{R}^{1 \times 100}$]

$output = \sigma(Z*W_2)$       [$\mathbb{R}^{1 \times 10}$]

### Backward Pass:

Derivative of loss: $diff = (output - true\_output)$   [$\mathbb{R}^{1 x 10}$]

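The derivative of the sigmoid must be evaluated at each layer's pre-activation. Since $\sigma^{'}(a) = \sigma(a)(1-\sigma(a))$, it can be written directly in terms of the layer's output: $output.(1-output)$ for the output layer and $Z.(1-Z)$ for the hidden layer.

$\frac{\partial L}{\partial W_2} = Z^{T}*(diff.output.(1-output))$    [$\mathbb{R}^{100 \times 10}$]

$\frac{\partial L}{\partial W_1} = X^{T}*\big(((diff.output.(1-output))*W_2^{T}).Z.(1-Z)\big)$ [$\mathbb{R}^{784 \times 100}$]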

### Parameter Update:

$W_1 = W_1 - \eta \frac{\partial L}{\partial W_1}$

$W_2 = W_2 - \eta \frac{\partial L}{\partial W_2}$

In [0]:
# Train the MLP with hand-written back-propagation: for every mini-batch do a
# forward pass, compute the gradients of the loss w.r.t. W_1 and W_2 by the
# chain rule, and take one gradient-descent step.
for epoch in range(num_epochs):
    accuracy_metric = Metrics()

    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        # Forward Pass
        x_batch = x_batch.view(-1, input_dim)  # flatten each image into one row
        hidden_state_output = sigmoid(torch.mm(x_batch, W_1))
        output = sigmoid(torch.mm(hidden_state_output, W_2))

        # Convert the labels to one hot encoded format
        y_batch_onehot = one_hot(y_batch, output_dim)

        # Derivative of the loss w.r.t. the network output
        diff = output - y_batch_onehot

        # Loss (half the sum of squared errors over the batch)
        loss = diff.pow(2).sum() * 0.5

        # Backward Pass (Back-Propagation)
        # `output` and `hidden_state_output` are already sigmoid activations,
        # so the sigmoid derivative at the pre-activation is a * (1 - a).
        # Passing them through sigmoid_diff would apply the sigmoid twice.
        delta_output = diff * output * (1 - output)  # batch x 10
        delta_hidden = (torch.mm(delta_output, W_2.t())
                        * hidden_state_output * (1 - hidden_state_output))  # batch x 100

        grad_w2 = torch.mm(hidden_state_output.t(), delta_output)  # 100 x 10 dimensional
        grad_w1 = torch.mm(x_batch.t(), delta_hidden)  # 784 x 100

        # Perform gradient descent
        W_1 -= learning_rate * grad_w1
        W_2 -= learning_rate * grad_w2

        accuracy = compute_accuracy(output, y_batch)
        accuracy_metric.update(accuracy[0])

        if batch_idx % 200 == 0:
            print("Epoch: {0} \t|\t loss: {1} \t|\t accuracy: {2}".format(epoch, loss, accuracy_metric.avg))

## Using in-built Autograd function

loss.backward():  calculates the gradients of the loss function w.r.t all the parameters in the network

In [0]:
learning_rate = 0.1

# Fresh weights for this experiment; requires_grad=True makes autograd track
# every operation on them so that loss.backward() can fill in their .grad.
W_1_ag = torch.randn(input_dim, num_of_hidden_nodes, requires_grad=True)
W_2_ag = torch.randn(num_of_hidden_nodes, output_dim, requires_grad=True)

for epoch in range(num_epochs):
    accuracy_metric = Metrics()

    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        x_batch = x_batch.view(-1, input_dim)  # flatten each image into one row

        # Forward Pass
        hidden_state_output = torch.sigmoid(torch.mm(x_batch, W_1_ag))
        output = torch.sigmoid(torch.mm(hidden_state_output, W_2_ag))

        # Convert the labels to one hot encoded format
        y_batch_onehot = one_hot(y_batch, output_dim)

        # Loss (half the sum of squared errors over the batch)
        loss = (output - y_batch_onehot).pow(2).sum().mul(0.5)
        loss.backward()

        # Gradient-descent step. The update itself must not be recorded by
        # autograd, so it runs under no_grad() rather than going through the
        # discouraged `.data` attribute.
        with torch.no_grad():
            W_1_ag -= learning_rate * W_1_ag.grad
            W_2_ag -= learning_rate * W_2_ag.grad

        # backward() accumulates into .grad, so reset the gradients before the
        # next batch's backward pass.
        W_1_ag.grad.zero_()
        W_2_ag.grad.zero_()

        accuracy = compute_accuracy(output, y_batch)
        accuracy_metric.update(accuracy[0])

        if batch_idx % 200 == 0:
            print("Epoch: {0} \t|\t loss: {1} \t|\t accuracy: {2}".format(epoch, loss ,accuracy_metric.avg))
