# Lab 9_2 MNIST with neural network

Solve MNIST with multiple layers of neurons to enhance performance.

In [1]:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import random

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# for reproducibility
random.seed(111)
torch.manual_seed(777)
if device == 'cuda':
    torch.cuda.manual_seed_all(777)

In [3]:
# parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100

In [4]:
# MNIST dataset
mnist_train = dsets.MNIST(root='MNIST_data/',
                          train=True,
                          transform=transforms.ToTensor(),
                          download=True)

mnist_test = dsets.MNIST(root='MNIST_data/',
                         train=False,
                         transform=transforms.ToTensor(),
                         download=True)

In [5]:
# dataset loader
data_loader = torch.utils.data.DataLoader(dataset=mnist_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          drop_last=True)

In [6]:
# nn layers
linear1 = torch.nn.Linear(784, 256, bias=True)
linear2 = torch.nn.Linear(256, 256, bias=True)
linear3 = torch.nn.Linear(256, 10, bias=True)
relu = torch.nn.ReLU()

`torch.nn.ReLU` is a class in the PyTorch library that applies the Rectified Linear Unit (ReLU) activation function to the input data. The ReLU activation function is defined as:

$ \text{ReLU}(x) = \max(0, x) $

This means that it outputs the input directly if it is positive; otherwise, it outputs zero. ReLU is widely used in neural networks because it helps to introduce non-linearity into the model while being computationally efficient.


In [7]:
# Initialization
torch.nn.init.normal_(linear1.weight)
torch.nn.init.normal_(linear2.weight)
torch.nn.init.normal_(linear3.weight)

Parameter containing:
tensor([[-0.6645,  0.6646,  0.7159,  ..., -0.3040, -0.8945, -0.7977],
        [-1.2045,  2.4545, -1.8073,  ..., -1.7139, -0.0451,  0.3864],
        [ 0.3414,  0.3114, -1.9218,  ..., -0.4525, -0.6849,  0.9663],
        ...,
        [ 1.3915, -0.4048,  0.2338,  ..., -0.8257, -0.8397,  1.9816],
        [ 1.0414,  0.2130, -0.0417,  ...,  1.7541, -0.6454, -0.0821],
        [-0.1162, -1.2692,  0.8201,  ..., -0.8303, -0.5022,  0.0583]],
       requires_grad=True)

The weights determine how the outputs of the previous neuron are combined to produce the final output.

This code initializes the weights of three linear layers (linear1, linear2, and linear3) using the normal_ function from the torch.nn.init module. The normal_ function fills the weights with values drawn from a normal (Gaussian) distribution.

The point of initializing the weights in this manner is to ensure that the neural network starts with weights that are not too large or too small, and break the symmetry, which can help with the convergence of the training process.

In [8]:
# model
model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

In [9]:
# define cost/loss & optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
total_batch = len(data_loader)
for epoch in range(training_epochs):
    avg_cost = 0

    for X, Y in data_loader:
        # reshape input image into [batch_size by 784]
        # label is not one-hot encoded
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)

        optimizer.zero_grad()
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        cost.backward()
        optimizer.step()

        avg_cost += cost / total_batch

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

print('Learning finished')

Epoch: 0001 cost = 128.811523438
Epoch: 0002 cost = 35.830200195
Epoch: 0003 cost = 23.081176758
Epoch: 0004 cost = 16.084297180
Epoch: 0005 cost = 11.661833763
Epoch: 0006 cost = 8.553230286
Epoch: 0007 cost = 6.407429218
Epoch: 0008 cost = 4.845437527
Epoch: 0009 cost = 3.704708338
Epoch: 0010 cost = 2.775542498
Epoch: 0011 cost = 2.172554731
Epoch: 0012 cost = 1.680796385
Epoch: 0013 cost = 1.240631223
Epoch: 0014 cost = 1.162401319
Epoch: 0015 cost = 0.868149042
Learning finished


In [15]:
# Test the model using test sets
with torch.no_grad():
    X_test = mnist_test.data.view(-1, 28 * 28).float().to(device)
    Y_test = mnist_test.targets.to(device)

    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

    # Get one and predict
    r = random.randint(0, len(mnist_test) - 1)
    X_single_data = mnist_test.data[r:r + 1].view(-1, 28 * 28).float().to(device)
    Y_single_data = mnist_test.targets[r:r + 1].to(device)

    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy: 0.9472000002861023
Label:  6
Prediction:  6
