In [1]:
# import necessary modules
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch import cuda

from torchvision import datasets
from torchvision import transforms

import matplotlib.pyplot as plt

In [2]:
# load or download MNIST datasets
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.7,), (0.7,)),]) # images to tensors with normalization
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=trans)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=trans)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [3]:
# create data loaders
batch_size = 64
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [15]:
# check the data from the data loader
for X, y in test_dataloader:
    print(f'Shape of X [batch, channel, height, width]: {X.shape}')
    print(f'Shape of y: {y.shape} -> ({y.dtype})')
    break

Shape of X [batch, channel, height, width]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) -> (torch.int64)


In [5]:
# define MLP network
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.input_layer = nn.Linear(28*28, 256)
        self.relu1 = nn.ReLU()
        self.hidden_layer = nn.Linear(256, 512)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(512, 10)
        # self.linear_relu_layers = nn.Sequential(nn.Linear(28*28, 256),
        #                                         nn.ReLU(),
        #                                         nn.Linear(256, 512),
        #                                         nn.ReLU(),
        #                                         nn.Linear(512, 10))
    
    def forward(self, x):
        x = self.flatten(x)
        h = self.relu1(self.input_layer(x))
        h = self.relu2(self.hidden_layer(h))
        y = self.output_layer(h)
        # y = self.linear_relu_layers(x)
        return y

In [None]:
# define MLP network
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_layers = nn.Sequential(nn.Linear(28*28, 256),
                                                nn.ReLU(),
                                                nn.Linear(256, 512),
                                                nn.ReLU(),
                                                nn.Linear(512, 10))
    
    def forward(self, x):
        return self.linear_relu_layers(self.flatten(x))

In [6]:
# check device for training
device = 'cuda' if cuda.is_available() else 'cpu'
print(f'device: {device}')

device: cuda


In [7]:
# get a MLP model and send it to device
myMLP = MLP().to(device)
print(myMLP)

MLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (input_layer): Linear(in_features=784, out_features=256, bias=True)
  (relu1): ReLU()
  (hidden_layer): Linear(in_features=256, out_features=512, bias=True)
  (relu2): ReLU()
  (output_layer): Linear(in_features=512, out_features=10, bias=True)
)


In [8]:
# define a loss function and an optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(myMLP.parameters(), lr=5e-3)

In [11]:
# train the MLP model
epochs = 5
ndata = len(train_dataloader.dataset)
print('training starts!')

for e in range(epochs):
    print(f'\nepoch {e+1}\n------------------------------')
    myMLP.train() # train mode
    
    for nbatch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device) # tensors to device(gpu)

        prediction = myMLP(X)
        train_loss = loss_fn(prediction, y) # calculate the loss of the prediction on the batch

        optimizer.zero_grad() # clear gradients
        train_loss.backward() # backpropagation
        optimizer.step()

        if nbatch % 100 == 0: # track the training
            train_loss, current = train_loss.item(), nbatch * len(X)
            print(f'loss: {train_loss:>7f}  [{current:>5d}/{ndata:>5d}]')
print('\ntraining is finished!')

training starts!

epoch 1
------------------------------
loss: 0.258528  [    0/60000]
loss: 0.273169  [ 6400/60000]
loss: 0.259676  [12800/60000]
loss: 0.361321  [19200/60000]
loss: 0.263151  [25600/60000]
loss: 0.337552  [32000/60000]
loss: 0.229728  [38400/60000]
loss: 0.393709  [44800/60000]
loss: 0.358780  [51200/60000]
loss: 0.423601  [57600/60000]

epoch 2
------------------------------
loss: 0.232001  [    0/60000]
loss: 0.266170  [ 6400/60000]
loss: 0.243248  [12800/60000]
loss: 0.349420  [19200/60000]
loss: 0.245207  [25600/60000]
loss: 0.325074  [32000/60000]
loss: 0.218572  [38400/60000]
loss: 0.379761  [44800/60000]
loss: 0.337337  [51200/60000]
loss: 0.414267  [57600/60000]

epoch 3
------------------------------
loss: 0.210979  [    0/60000]
loss: 0.260296  [ 6400/60000]
loss: 0.231741  [12800/60000]
loss: 0.338896  [19200/60000]
loss: 0.231851  [25600/60000]
loss: 0.312602  [32000/60000]
loss: 0.209266  [38400/60000]
loss: 0.371084  [44800/60000]
loss: 0.319284  [51200/

In [10]:
# test the MLP model
ndata = len(test_dataloader.dataset)
nbatch = len(test_dataloader)
myMLP.eval() # test mode
test_loss, correct = 0, 0 # initialization

with torch.no_grad():
    for X, y in test_dataloader:
        X, y = X.to(device), y.to(device)
        prediction = myMLP(X)
        test_loss += loss_fn(prediction, y).item()
        correct += (prediction.argmax(1) == y).type(torch.float).sum().item() # calculate and accumulate the loss
test_loss /= nbatch # average loss for all batches
correct /= ndata # accuracy for all data
print(f"test error\n-> accuracy: {(100*correct):>0.1f}%, average loss: {test_loss:>8f}")

test error
-> accuracy: 90.5%, average loss: 0.324948
