# Implementation of Multilayer Perceptron from Scratch

In [2]:
import torch
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

batch_size = 256

# Load Fashion MNIST dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

train_iter = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_iter = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 4.2.1 Initializing Model Parameters

Recall that Fashion-MNIST contains 10 classes, and that each image consists of a 28x28=784
 grid of grayscale pixel values. Again, we will disregard the spatial structure among the pixels for now, so we can think of this as simply a classification dataset with 784 input features and 10 classes. To begin, we will implement an MLP with one hidden layer and 256 hidden units. Note that we can regard both of these quantities as hyperparameters. Typically, we choose layer widths in powers of 2, which tend to be computationally efficient because of how memory is allocated and addressed in hardware.

Again, we will represent our parameters with several tensors. Note that for every layer, we must keep track of one weight matrix and one bias vector. As always, we allocate memory for the gradients of the loss with respect to these parameters

In [3]:
num_inputs, num_outputs = 784, 10
num_hiddens1 = 256
num_hiddens2 = 128   # new second hidden layer

# Layer 1: Input → Hidden1
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens1) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens1))

# Layer 2: Hidden1 → Hidden2
W2 = nn.Parameter(torch.randn(num_hiddens1, num_hiddens2) * 0.01)
b2 = nn.Parameter(torch.zeros(num_hiddens2))

# Layer 3: Hidden2 → Output
W3 = nn.Parameter(torch.randn(num_hiddens2, num_outputs) * 0.01)
b3 = nn.Parameter(torch.zeros(num_outputs))

params = [W1, b1, W2, b2, W3, b3]


# Activation Function

In [4]:
def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)

# 4.2.3 Model

In [5]:
def net(X):
    X = X.reshape((-1, num_inputs))
    H = relu(X @ W1 + b1)  # Here '@' stands for matrix multiplication
    return (H @ W2 + b2)


# 4.2.4 Loss Function

In [6]:

loss = nn.CrossEntropyLoss()
     


# 4.2.5 Training

In [7]:
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)

# Training loop
for epoch in range(num_epochs):
    train_loss_sum = 0
    train_correct = 0
    train_total = 0
    
    for X, y in train_iter:
        # Forward pass
        y_pred = net(X)
        l = loss(y_pred, y)
        
        # Backward pass
        updater.zero_grad()
        l.backward()
        updater.step()
        
        # Track metrics
        train_loss_sum += l.item()
        _, predicted = torch.max(y_pred.data, 1)
        train_total += y.size(0)
        train_correct += (predicted == y).sum().item()
    
    # Evaluate on test set
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for X, y in test_iter:
            y_pred = net(X)
            _, predicted = torch.max(y_pred.data, 1)
            test_total += y.size(0)
            test_correct += (predicted == y).sum().item()
    
    train_acc = train_correct / train_total
    test_acc = test_correct / test_total
    avg_loss = train_loss_sum / len(train_iter)
    
    print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}')

Epoch 1/10 | Loss: 0.8689 | Train Acc: 0.7016 | Test Acc: 0.7876
Epoch 2/10 | Loss: 0.5015 | Train Acc: 0.8182 | Test Acc: 0.8248
Epoch 2/10 | Loss: 0.5015 | Train Acc: 0.8182 | Test Acc: 0.8248
Epoch 3/10 | Loss: 0.4519 | Train Acc: 0.8363 | Test Acc: 0.8282
Epoch 3/10 | Loss: 0.4519 | Train Acc: 0.8363 | Test Acc: 0.8282
Epoch 4/10 | Loss: 0.4152 | Train Acc: 0.8503 | Test Acc: 0.8322
Epoch 4/10 | Loss: 0.4152 | Train Acc: 0.8503 | Test Acc: 0.8322
Epoch 5/10 | Loss: 0.3902 | Train Acc: 0.8589 | Test Acc: 0.8496
Epoch 5/10 | Loss: 0.3902 | Train Acc: 0.8589 | Test Acc: 0.8496
Epoch 6/10 | Loss: 0.3748 | Train Acc: 0.8654 | Test Acc: 0.8535
Epoch 6/10 | Loss: 0.3748 | Train Acc: 0.8654 | Test Acc: 0.8535
Epoch 7/10 | Loss: 0.3622 | Train Acc: 0.8691 | Test Acc: 0.8529
Epoch 7/10 | Loss: 0.3622 | Train Acc: 0.8691 | Test Acc: 0.8529
Epoch 8/10 | Loss: 0.3490 | Train Acc: 0.8747 | Test Acc: 0.8414
Epoch 8/10 | Loss: 0.3490 | Train Acc: 0.8747 | Test Acc: 0.8414
Epoch 9/10 | Loss: 0.3417