# Multilayer Logistic Regression

In [1]:
# Imports 
import numpy as np
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from tqdm.notebook import tqdm

In [2]:
# Load the data
mnist_train = datasets.MNIST(root="./datasets", train=True, transform=transforms.ToTensor(), download=True)
mnist_test = datasets.MNIST(root="./datasets", train=False, transform=transforms.ToTensor(), download=True)
train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=100, shuffle=False)

# By shuffling the training data, the model is forced to learn the underlying patterns in the data rather than memorizing the order of the samples
# Test set should remain in its original order to provide a consistent and reliable evaluation of the model's performance

In [3]:
# Training 
# Initialize parameters
W1 = torch.randn(784, 500) / np.sqrt(784) # First layer weights
W1.requires_grad_()
b1 = torch.zeros(500, requires_grad=True) # First layer biases

W2 = torch.randn(500, 10) / np.sqrt(500) # Second layer weights
W2.requires_grad_()
b2 = torch.zeros(10, requires_grad=True) # Second layer biases

In [4]:
# Optimizer
optimizer = torch.optim.SGD([W1, b1, W2, b2], lr=0.1)

In [5]:
# Iterate through train set minibatches
for images, labels in tqdm(train_loader):
    # Zero out the gradients
    optimizer.zero_grad()
    
    # Forward pass
    x = images.view(-1, 28 * 28)
    h1 = torch.matmul(x, W1) + b1 # First layer output
    x1 = F.relu(h1) # Apply non-linear activation (ReLU)
    y = torch.matmul(x1, W2) + b2 # Second layer output
    
    cross_entropy = F.cross_entropy(y, labels)
    
    # Backward Pass
    cross_entropy.backward()
    optimizer.step()

  0%|          | 0/600 [00:00<?, ?it/s]

In [6]:
# Testing
correct = 0
total = len(mnist_test)
with torch.no_grad():
    # Iterate through test set minibatchs
    for images, labels in tqdm(test_loader):
        # Forward pass
        x = images.view(-1, 28 * 28)
        h1 = torch.matmul(x, W1) + b1
        x1 = F.relu(h1)
        y = torch.matmul(x1, W2) + b2
        predictions = torch.argmax(y, dim=1)
        correct += torch.sum((predictions == labels).float())
        
print('Test accuracy: {}'.format(correct / total))

  0%|          | 0/100 [00:00<?, ?it/s]

Test accuracy: 0.9230999946594238


Test accuracy for 784->256->10: 0.9194999933242798
Test accuracy for 784->500->10: 0.9230999946594238

Parameters (784->256->10): [784 * 256]Layer1 + 256Bias1 + [256 * 10] + 10Bias2 = 203,530
Parameters (784->500->10): [784 * 500]Layer1 + 500Bias1 + [500 * 10] + 10Bias2 = 397,510