# Implementing a multilayer perceptron (MLP) for MNIST

In [107]:
# import useful packages 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt 
import time 
import utils 
import numpy as np

# print the installed torch version so the environment is recorded next to the outputs
print(torch.__version__)

1.1.0


## Download the data 

In [123]:
# MNIST training split (downloaded to ./data if missing), images converted to tensors;
# the loader serves it in shuffled batches of 6000 images
mnist_train = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=6000, shuffle=True)

# MNIST test split, served in batches of 1000 images in the dataset's own order
mnist_test = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)
test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1000)

## Visualize the data 

In [126]:
# extract data from the loaders
# Only the FIRST batch of each loader is pulled out here; train_set/train_targets and
# test_set/test_targets therefore hold one batch each, not the full MNIST splits.
train_data = enumerate(train_loader)
batch_idx, (train_set, train_targets) = next(train_data)
test_data = enumerate(test_loader)
batch_idx, (test_set, test_targets) = next(test_data)

# plot the first 4 training images on a 2x2 grid
images = plt.figure()
for i in range(4):
    plt.subplot(2, 2, i + 1)
    # train_set[i] indexes one sample; [0] selects its first (only) channel
    plt.imshow(train_set[i][0], cmap='gray', interpolation='none')
# adjust the spacing once, after all four subplots exist
plt.tight_layout()

print('train size = ' + str(train_set.size()))
print('test size = ' + str(test_set.size()))


train size = torch.Size([6000, 1, 28, 28])
test size = torch.Size([1000, 1, 28, 28])


AttributeError: 'module' object has no attribute 'to_rgba'

## Two-layer network class

In [142]:
# design the net 
class two_layers(nn.Module):
    """Fully connected network: linear layer -> ReLU -> linear layer.

    Args:
        input_size: number of features of each input row.
        hidden_size: number of units produced by the first linear layer.
        output_size: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(two_layers, self).__init__()

        # input -> hidden projection (with bias)
        self.layer1 = nn.Linear(in_features=input_size, out_features=hidden_size, bias=True)
        # hidden -> output projection (with bias)
        self.layer2 = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True)

    def forward(self, x):
        """Return layer2(relu(layer1(x))); no activation is applied to the output."""
        hidden_activation = F.relu(self.layer1(x))
        return self.layer2(hidden_activation)

In [143]:
# instantiate the MLP: 784 inputs (a flattened 28x28 image), 500 hidden units, 10 outputs
net = two_layers(784, 500, 10)
# show the layer structure, then the parameter count via the utils helper
print(net)
utils.display_num_param(net)

two_layers(
  (layer1): Linear(in_features=784, out_features=500, bias=True)
  (layer2): Linear(in_features=500, out_features=10, bias=True)
)
There are 397510 (0.40 million) parameters in this neural network


## Training setup: loss function, optimizer (SGD), learning rate and batch size

In [154]:
# loss function: cross entropy, applied to the network outputs
cross_entropy = nn.CrossEntropyLoss()
# optimizer: SGD over every parameter of the net, with learning rate lr
optimizer = optim.SGD(net.parameters(), lr=0.01)
# batch size used when slicing mini-batches for training and evaluation
bs = 20

## Evaluate error on the test set 

In [186]:
def error_on_test_set():
    """Print the average error of `net` over the samples held in `test_set`.

    Reads the notebook globals `net`, `test_set`, `test_targets`, `bs` and
    `utils`; takes no arguments and returns nothing. The samples are processed
    in consecutive slices of `bs` rows and the result is printed as a percentage.
    """
    num_samples = test_set.size(0)
    weighted_error = 0.0

    # evaluation only: no gradient is needed, so skip autograd bookkeeping
    with torch.no_grad():
        for batch in range(0, num_samples, bs):

            # extract test minibatch and the corresponding labels
            test_mini_batch = test_set[batch:batch+bs]
            label_minibatch = test_targets[batch:batch+bs]
            # flatten each image; -1 lets the last slice be shorter than bs
            input_test = test_mini_batch.view(-1, 784)
            # one forward pass over the network
            y_hat = net(input_test)
            # compute error
            # assumes utils.get_error returns a per-batch error rate -- TODO confirm
            error = utils.get_error(y_hat, label_minibatch)

            # weight by the slice size so a short final slice is not over-counted
            weighted_error += error.item() * input_test.size(0)

    avg_error = weighted_error / num_samples
    print('The error on test set = ' + str(avg_error*100) + '%')

## Training the network 

In [189]:
# We will train our network over 100 epochs of mini-batch SGD.
# Uses names from earlier cells: net, optimizer, cross_entropy, bs, train_set,
# train_targets, utils and error_on_test_set.
# NOTE(review): this cell updates the existing `net` in place, so re-running it
# continues training; re-create `net` and `optimizer` first for a fresh run.
start = time.time()
# number of training samples actually held in train_set
num_train = train_set.size(0)
for epoch in range(100):

    # keep track of the loss, error and the number of batches
    current_loss = 0
    current_error = 0
    num_batches = 0

    # shuffle the data indices so every epoch visits the samples in a new order
    shuffled_indices = torch.randperm(num_train)
    for batch in range(0, num_train, bs):

        # extract mini-batches
        shuffled_batch = shuffled_indices[batch:batch+bs]
        train_minibatch = train_set[shuffled_batch]
        label_minibatch = train_targets[shuffled_batch]

        # ---> forward pass
        # reset the gradient accumulated by the previous step
        optimizer.zero_grad()
        # flatten each image to a row of 784 values; -1 lets the last
        # mini-batch be shorter than bs
        inputs = train_minibatch.view(-1, 784)
        # a forward pass on the network (only the parameters need gradients,
        # so the inputs are not marked as requiring grad)
        y_hat = net(inputs)
        # compute the cross entropy loss
        loss = cross_entropy(y_hat, label_minibatch)
        # <--- back propagation
        loss.backward()
        # update the net parameters
        optimizer.step()
        # get the current error (detached: no gradient is needed for the metric)
        error = utils.get_error(y_hat.detach(), label_minibatch)

        # update the running stats.
        num_batches += 1
        current_loss += loss.item()
        current_error += error.item()

    # average loss/error over minibatches for the current epoch
    avg_loss = current_loss / num_batches
    avg_error = current_error / num_batches
    # seconds since training started (not used in the visible part of this cell)
    elapsed_time = time.time() - start

    # every 10 epochs display stats.
    if epoch % 10 == 0:
        print('The loss for epoch number ' + str(epoch) + ' = ' + str(avg_loss))
        print('The error for epoch number ' + str(epoch) + ' = ' + str(avg_error))

        # evaluate error on test set
        error_on_test_set()

The loss for epoch number 0 = 0.00865451056627
The error for epoch number 0 = 0.0
The error on test set = 5.90000092983%
The loss for epoch number 10 = 0.00785449115094
The error for epoch number 10 = 0.0
The error on test set = 5.90000092983%
The loss for epoch number 20 = 0.00718958249529
The error for epoch number 20 = 0.0
The error on test set = 5.70000088215%
The loss for epoch number 30 = 0.00663072751951
The error for epoch number 30 = 0.0
The error on test set = 5.70000088215%
The loss for epoch number 40 = 0.00614241601123
The error for epoch number 40 = 0.0
The error on test set = 5.70000088215%
The loss for epoch number 50 = 0.00572464592716
The error for epoch number 50 = 0.0
The error on test set = 5.80000090599%
The loss for epoch number 60 = 0.00534556960347
The error for epoch number 60 = 0.0
The error on test set = 5.90000081062%
The loss for epoch number 70 = 0.00502621515712
The error for epoch number 70 = 0.0
The error on test set = 5.80000090599%
The loss for epoch