#### Project: A multilayer perceptron for multi-class classfication and Applying Droupout Regularization technique.

Dropout: This is a regularization techniques used to improve an overfitting model(high variance) by droping off some nodes and links that doesn't participate in the training rocess then taining the model with the remaining hidden nodes and node units.

#### Packages selection
- The first things is to import all the neccesary packages needed for this project

In [4]:
import time
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F

# select GPU when cuda is available
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

#### Settings
- Configure the device
- define all the hyperparameters to be used and needs to be tuned to achive a better accuracy
- Load and explore the data

In [13]:
# device
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

# Hyperparameters
random_seed = 1
learning_rate = 0.001
num_epochs = 10
batch_size = 64
dropout_prob = 0.5

# Model Architecture parameters
num_features = 784
num_hidden_1 = 128
num_hidden_2 = 256
num_classes = 10

# dataset -> MNIST
# Note: transform.ToTensor() scale image image to 0-1 range

train_dataset = datasets.MNIST(root='data',
                              train=True,
                              transform=transforms.ToTensor(),
                              download=True)

test_dataset = datasets.MNIST(root='data',
                             train=False,
                             transform=transforms.ToTensor())

train_loader = DataLoader(dataset=train_dataset,
                         batch_size=batch_size,
                         shuffle=True)

test_loader = DataLoader(dataset=test_dataset,
                        batch_size=batch_size,
                        shuffle=False)

# check the dataset
for images, labels in train_loader:
    print("Image batch dimension", images.shape)
    print("Image label dimension", labels.shape)
    break

Image batch dimension torch.Size([64, 1, 28, 28])
Image label dimension torch.Size([64])


#### Define the architecture of the model such as
- The number of input layers; which is determined by the features of the data
- Number of total hidden layers in the model (iterative)
of hidden units in each layers (iterative)
- The output layer node units is determined by the intended outcome to achieve
- Here: we build a 3 layers multilayer perceptron i.e 2 hidden layers and 1 output layer
- Note: We don't count the input layer as part of the layers.

In [14]:
"""
Architecture:
X -> Linear -> Relu -> dropout -> Linear -> Relu -> dropout -> Linear -> Softmax Layer -> y
"""

class MultiLayerPerceptron(nn.Module):
    def __init__(self, num_features, num_classes):
        super(MultiLayerPerceptron, self).__init__()
        
        # 1st hidden layer
        self.linear_1 = torch.nn.Linear(num_features, num_hidden_1)
        
        # 2nd hidden layer
        self.linear_2 = torch.nn.Linear(num_hidden_1, num_hidden_2)
        
        # output layer
        self.linear_out = torch.nn.Linear(num_hidden_2, num_classes)
        
    def forward(self, x):
        """
        Link all the layers together
        """
        out = self.linear_1(x)
        out = F.relu(out)
        out = F.dropout(out, p=dropout_prob, training=self.training)
        
        out = self.linear_2(out)
        out = F.relu(out)
        out = F.dropout(out, p=dropout_prob, training=self.training)
        
        outputs = self.linear_out(out)
        probas = F.softmax(outputs, dim=1)
        return outputs, probas
    

#### Loss function and optimizer
- Instantiate the model
- define the specific Loss function to be used either cross entropy, MSELoss, etc
- define the optimization algorithm to be used either SGD, Adam, RMSprop, Momentum etc.

In [15]:
torch.manual_seed(random_seed)
model = MultiLayerPerceptron(num_features, num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#### compute accuracy
- A function to compute train and test accuracy

In [16]:
def compute_accuracy(model, data_loader):
    model.eval()
    correct_prediction, num_examples = 0, 0
    with torch.no_grad():
        for features, labels in data_loader:
            features = features.view(-1, 28*28).to(device)
            labels = labels.to(device)
            outputs, probas = model(features)
            _, predicted_labels = torch.max(probas, 1)
            num_examples +=labels.size(0)
            correct_prediction += (predicted_labels == labels).sum()
        return correct_prediction.float() / num_examples * 100

#### Training a model requires the following steps¶
- Reset all the gradients to zero (0)
- Make a forward pass (make a prediction)
- Calculate the loss
- Perform back propagation
- Update all the parameters (weight and biases)

In [17]:
start_time = time.time()
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28*28).to(device)
        labels = labels.to(device)
        
        # Forward and Back Pass
        outputs, probas = model(images)
        loss = F.cross_entropy(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Logging
        if not i % 50:
            print('Epoch: %03d/%03d | Batch: %03d/%03d | Cost: %.4f'
                 %(epoch+1, num_epochs, i, total_step, loss))
    print('Epoch: %03d/%03d training accuracy: %.2f%%' % (
    epoch+1, num_epochs, compute_accuracy(model, train_loader)))
    
    print('Time elapsed: %.2f min ' % ((time.time() - start_time) / 60))
    
print('Total training Time: %.2f min' % ((time.time() - start_time) / 60))

Epoch: 001/010 | Batch: 000/938 | Cost: 2.3000
Epoch: 001/010 | Batch: 050/938 | Cost: 0.9042
Epoch: 001/010 | Batch: 100/938 | Cost: 0.5057
Epoch: 001/010 | Batch: 150/938 | Cost: 0.5168
Epoch: 001/010 | Batch: 200/938 | Cost: 0.3971
Epoch: 001/010 | Batch: 250/938 | Cost: 0.3894
Epoch: 001/010 | Batch: 300/938 | Cost: 0.4434
Epoch: 001/010 | Batch: 350/938 | Cost: 0.1512
Epoch: 001/010 | Batch: 400/938 | Cost: 0.2976
Epoch: 001/010 | Batch: 450/938 | Cost: 0.2695
Epoch: 001/010 | Batch: 500/938 | Cost: 0.3153
Epoch: 001/010 | Batch: 550/938 | Cost: 0.2600
Epoch: 001/010 | Batch: 600/938 | Cost: 0.1263
Epoch: 001/010 | Batch: 650/938 | Cost: 0.4475
Epoch: 001/010 | Batch: 700/938 | Cost: 0.3200
Epoch: 001/010 | Batch: 750/938 | Cost: 0.4549
Epoch: 001/010 | Batch: 800/938 | Cost: 0.1912
Epoch: 001/010 | Batch: 850/938 | Cost: 0.1499
Epoch: 001/010 | Batch: 900/938 | Cost: 0.2598
Epoch: 001/010 training accuracy: 94.60%
Time elapsed: 0.34 min 
Epoch: 002/010 | Batch: 000/938 | Cost: 0.

Epoch: 009/010 | Batch: 600/938 | Cost: 0.0029
Epoch: 009/010 | Batch: 650/938 | Cost: 0.0098
Epoch: 009/010 | Batch: 700/938 | Cost: 0.0561
Epoch: 009/010 | Batch: 750/938 | Cost: 0.0151
Epoch: 009/010 | Batch: 800/938 | Cost: 0.0034
Epoch: 009/010 | Batch: 850/938 | Cost: 0.0431
Epoch: 009/010 | Batch: 900/938 | Cost: 0.0694
Epoch: 009/010 training accuracy: 99.29%
Time elapsed: 3.55 min 
Epoch: 010/010 | Batch: 000/938 | Cost: 0.0041
Epoch: 010/010 | Batch: 050/938 | Cost: 0.0072
Epoch: 010/010 | Batch: 100/938 | Cost: 0.0072
Epoch: 010/010 | Batch: 150/938 | Cost: 0.0057
Epoch: 010/010 | Batch: 200/938 | Cost: 0.0104
Epoch: 010/010 | Batch: 250/938 | Cost: 0.0013
Epoch: 010/010 | Batch: 300/938 | Cost: 0.0008
Epoch: 010/010 | Batch: 350/938 | Cost: 0.0028
Epoch: 010/010 | Batch: 400/938 | Cost: 0.0012
Epoch: 010/010 | Batch: 450/938 | Cost: 0.0165
Epoch: 010/010 | Batch: 500/938 | Cost: 0.0020
Epoch: 010/010 | Batch: 550/938 | Cost: 0.0203
Epoch: 010/010 | Batch: 600/938 | Cost: 0.

### Testing/Evaluation

In [18]:
# print the test accuracy
print("Test Accuracy: %.2f%%" % (compute_accuracy(model, test_loader)))

Test Accuracy: 97.72%
