In [1]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3060 Laptop GPU
Using device: cuda


In [2]:
#Original source: https://www.kaggle.com/code/hojjatk/read-mnist-dataset
#It has been modified for ease of use w/ pytorch

#You do NOT need to modify ANY code in this file!

import numpy as np
import struct
from array import array
import torch

class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        n = 60000 if "train" in images_filepath else 10000
        labels = torch.zeros((n, 10))
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            l = torch.tensor(array("B", file.read())).unsqueeze(-1)
            l = torch.concatenate((torch.arange(0, n).unsqueeze(-1), l), dim = 1).type(torch.int32)
            labels[l[:,0], l[:,1]] = 1

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())
        images = torch.zeros((n, 28**2))
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            #img = img.reshape(28, 28)
            images[i, :] = torch.tensor(img)

        return images, labels

    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [3]:
import torch

class ReLU:
    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Applies the ReLU activation function.
        ReLU(x) = max(0, x)
        """
        return torch.maximum(torch.zeros_like(x), x)

    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        """
        Computes the gradient of ReLU.
        ReLU'(x) = 1 if x > 0 else 0
        """
        return delta * (x > 0).float()


class LeakyReLU:
    def __init__(self, alpha=0.1):
        """
        Initializes the LeakyReLU activation function with a specified alpha value.
        """
        self.alpha = alpha

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Applies the Leaky ReLU activation function.
        LeakyReLU(x) = x if x > 0 else alpha * x
        """
        return torch.where(x >= 0, x, self.alpha * x)

    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        """
        Computes the gradient of Leaky ReLU.
        LeakyReLU'(x) = 1 if x > 0 else alpha
        """
        return delta * torch.where(x >= 0, torch.ones_like(x), self.alpha * torch.ones_like(x))


In [7]:
import torch
import numpy as np
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class MLP:
    '''
    Multi-Layer Perceptron (MLP) for MNIST classification.
    Implements forward propagation, backpropagation, and training.
    '''
    
    def __init__(self, layer_sizes: list[int]):
        self.layer_sizes: list[int] = layer_sizes
        self.num_layers = len(layer_sizes) - 1
        self.weights: list[torch.tensor] = []
        self.biases: list[torch.tensor] = []
        self.features: list[torch.tensor] = []  

        self.learning_rate: float = 1
        self.batch_size: int = 1
        self.activation_function: callable[[torch.tensor], torch.tensor] = ReLU
        self.lambda_l2: float = 0

    def set_hp(self, lr: float, bs: int, activation: object, l2:float) -> None:
        """
        Set hyperparameters for training.
        """
        self.learning_rate = lr
        self.batch_size = bs
        self.activation_function = activation()
        self.lambda_l2 = l2

    def initialize(self) -> None:
        """
        Initialize all biases to zero and weights using Xavier initialization.
        """
        for i in range(self.num_layers):
            d_in = self.layer_sizes[i]
            d_out = self.layer_sizes[i + 1]
            w_range = np.sqrt(6 / (d_in + d_out))
            W = torch.empty(d_in, d_out, device=device).uniform_(-w_range, w_range)
            self.weights.append(W)
            b = torch.zeros(1, d_out, device=device) 
            self.biases.append(b)
            

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Forward propagation through all layers.
        Applies activation function to all layers except the last one.
        """
        self.features = [x.to(device)]  

        for i in range(self.num_layers):  
            x = torch.matmul(x, self.weights[i]) + self.biases[i]
            x = self.activation_function.forward(x)  
            self.features.append(x) 
        return x
    
    def backward(self, delta: torch.Tensor) -> None:
        '''
        This function should backpropagate the provided delta through the entire MLP, and update the weights according to the hyper-parameters
        stored in the class variables.
        '''
        # back propogation starts from the result
        for i in reversed(range(self.num_layers)):
            x = self.features[i]

            delta = self.activation_function.backward(delta,self.features[i+1])
            # Computing gradients
            dW = torch.matmul(x.T,delta) / self.batch_size + (self.lambda_l2 / self.batch_size) * self.weights[i]
            db = torch.sum(delta, dim=0, keepdim=True) / self.batch_size

            # Updating weights and biases with learning rate
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
            delta = torch.matmul(delta,self.weights[i].T)

    # def backward(self, delta: torch.tensor) -> None:
    #     """
    #     Backpropagation through all layers to compute gradients.
    #     Updates weights using gradient descent.
    #     """
    #     # grad_weights = [torch.zeros_like(w) for w in self.weights]
    #     # grad_biases = [torch.zeros_like(b) for b in self.biases]

    #     for i in reversed(range(self.num_layers)):  
    #         X = self.features[i]
    #         dW = torch.matmul(X.T, delta) / self.batch_size  + (self.lambda_l2 / self.batch_size) * self.weights[i]
    #         db = torch.sum(delta,dim=0,keepdim=True) / self.batch_size
            
    #         self.weights[i] -= self.learning_rate * dW
    #         self.biases[i] -= self.learning_rate * db

    #         # if i > 0:
    #         #     delta = (delta @ self.weights[i].T)
    #         #     if i > 1:
    #         #         delta *= self.activation_function.backward(delta,self.features[i-1])

    #         delta = torch.matmul(delta, self.weights[i].T) * self.activation_function.backward(torch.ones_like(X), X)



def TrainMLP(model: MLP, x_train: torch.tensor, y_train: torch.tensor) -> MLP:
    """
    Train the MLP for one epoch using mini-batch gradient descent with GPU support.
    """
    bs = model.batch_size
    N = x_train.shape[0]
    rng = np.random.default_rng()
    idx = rng.permutation(N)
    lambda_l2 = model.lambda_l2

    L = 0  

    for i in tqdm.tqdm(range(N // bs)):
        x = x_train[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_train[idx[i * bs:(i + 1) * bs], ...].to(device)

        
        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        # l2 regularisation
        l = -1 * torch.sum(y * torch.log(p))
        l2_penalty = (model.lambda_l2 / 2) * sum(torch.sum(w**2) for w in model.weights)
        l += l2_penalty
        L += l

        delta = p - y
        model.backward(delta)

    print("Train Loss:", L / ((N // bs) * bs))



def TestMLP(model: MLP, x_test: torch.tensor, y_test: torch.tensor) -> tuple[float, float]:
    """
    Evaluate the MLP on test data using GPU support.
    """
    bs = model.batch_size
    N = x_test.shape[0]

    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0
    A = 0

    for i in tqdm.tqdm(range(N // bs)):
        x = x_test[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_test[idx[i * bs:(i + 1) * bs], ...].to(device)

        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        
        l = -1 * torch.sum(y * torch.log(p))
        L += l

        
        A += torch.sum(torch.where(torch.argmax(p, dim = 1) == torch.argmax(y, dim = 1), 1, 0))

    test_loss = L / ((N // bs) * bs)
    test_accuracy = 100 * A / ((N // bs) * bs)

    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy:.2f}%")

    return test_loss, test_accuracy  


def normalize_mnist() -> tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
    '''
    This function loads the MNIST dataset, then normalizes the "X" values to have zero mean, unit variance.
    '''

    #IMPORTANT!!!#
    #UPDATE THE PATH BELOW!#
    base_path = "C:\\Users\\yoges\\Data_Science_Preparation\\CSCI 5922 Neural Networks and Deep Learning\\Lab Assignments\\Lab1Code\\MNIST\\"
    #^^^^^^^^#


    mnist = MnistDataloader(base_path + "train-images.idx3-ubyte", base_path + "train-labels.idx1-ubyte",
                            base_path + "t10k-images.idx3-ubyte", base_path + "t10k-labels.idx1-ubyte")
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_mean = torch.mean(x_train, dim = 0, keepdim = True)
    x_std = torch.std(x_train, dim = 0, keepdim = True)

    x_train -= x_mean
    x_train /= x_std
    x_train[x_train != x_train] = 0

    x_test -= x_mean
    x_test /= x_std
    x_test[x_test != x_test] = 0


    return x_train, y_train, x_test, y_test

def main():
    """
    Main function to train and evaluate the MLP model on MNIST using GPU.
    """
    x_train, y_train, x_test, y_test = normalize_mnist()

   
    model = MLP([784, 256, 10])  
    model.initialize()
    model.set_hp(lr=1e-3, bs=512, activation=ReLU, l2 = 0.001)  
    
    E = 25
    for _ in range(E):
        TrainMLP(model, x_train, y_train)
        TestMLP(model, x_test, y_test)


if __name__ == "__main__":
    main()


Using device: cuda


100%|██████████| 117/117 [00:00<00:00, 218.30it/s]


Train Loss: tensor(2.3631, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 452.42it/s]


Test Loss: nan, Test Accuracy: 27.05%


100%|██████████| 117/117 [00:00<00:00, 352.68it/s]


Train Loss: tensor(2.0340, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 689.89it/s]


Test Loss: nan, Test Accuracy: 42.62%


100%|██████████| 117/117 [00:00<00:00, 402.01it/s]


Train Loss: tensor(1.7416, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 591.83it/s]


Test Loss: nan, Test Accuracy: 54.79%


100%|██████████| 117/117 [00:00<00:00, 373.95it/s]


Train Loss: tensor(1.5009, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 622.69it/s]


Test Loss: nan, Test Accuracy: 63.85%


100%|██████████| 117/117 [00:00<00:00, 375.35it/s]


Train Loss: tensor(1.3047, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 778.47it/s]


Test Loss: nan, Test Accuracy: 69.94%


100%|██████████| 117/117 [00:00<00:00, 430.43it/s]


Train Loss: tensor(1.1554, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 745.10it/s]


Test Loss: nan, Test Accuracy: 73.62%


100%|██████████| 117/117 [00:00<00:00, 365.58it/s]


Train Loss: tensor(1.0425, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 464.30it/s]


Test Loss: nan, Test Accuracy: 76.04%


100%|██████████| 117/117 [00:00<00:00, 380.09it/s]


Train Loss: tensor(0.9545, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 786.78it/s]


Test Loss: nan, Test Accuracy: 77.99%


100%|██████████| 117/117 [00:00<00:00, 431.62it/s]


Train Loss: tensor(0.8842, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 648.84it/s]


Test Loss: nan, Test Accuracy: 79.48%


100%|██████████| 117/117 [00:00<00:00, 360.97it/s]


Train Loss: tensor(0.8268, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 734.97it/s]


Test Loss: nan, Test Accuracy: 80.89%


100%|██████████| 117/117 [00:00<00:00, 395.25it/s]


Train Loss: tensor(0.7790, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 560.65it/s]


Test Loss: nan, Test Accuracy: 81.93%


100%|██████████| 117/117 [00:00<00:00, 333.75it/s]


Train Loss: tensor(0.7387, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 745.91it/s]


Test Loss: nan, Test Accuracy: 82.78%


100%|██████████| 117/117 [00:00<00:00, 407.80it/s]


Train Loss: tensor(0.7039, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 723.70it/s]


Test Loss: nan, Test Accuracy: 83.43%


100%|██████████| 117/117 [00:00<00:00, 404.61it/s]


Train Loss: tensor(0.6742, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 606.90it/s]


Test Loss: nan, Test Accuracy: 84.12%


100%|██████████| 117/117 [00:00<00:00, 402.24it/s]


Train Loss: tensor(0.6481, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 721.45it/s]


Test Loss: nan, Test Accuracy: 84.39%


100%|██████████| 117/117 [00:00<00:00, 378.56it/s]


Train Loss: tensor(0.6246, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 723.02it/s]


Test Loss: nan, Test Accuracy: 84.93%


100%|██████████| 117/117 [00:00<00:00, 410.07it/s]


Train Loss: tensor(0.6039, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 674.61it/s]


Test Loss: nan, Test Accuracy: 85.33%


100%|██████████| 117/117 [00:00<00:00, 394.16it/s]


Train Loss: tensor(0.5856, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 716.28it/s]


Test Loss: nan, Test Accuracy: 85.58%


100%|██████████| 117/117 [00:00<00:00, 343.53it/s]


Train Loss: tensor(0.5689, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 711.54it/s]


Test Loss: 0.5462266802787781, Test Accuracy: 85.92%


100%|██████████| 117/117 [00:00<00:00, 423.19it/s]


Train Loss: tensor(0.5539, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 764.66it/s]


Test Loss: nan, Test Accuracy: 86.18%


100%|██████████| 117/117 [00:00<00:00, 349.62it/s]


Train Loss: tensor(0.5397, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 730.28it/s]


Test Loss: nan, Test Accuracy: 86.47%


100%|██████████| 117/117 [00:00<00:00, 378.54it/s]


Train Loss: tensor(0.5270, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 728.76it/s]


Test Loss: nan, Test Accuracy: 86.78%


100%|██████████| 117/117 [00:00<00:00, 389.00it/s]


Train Loss: tensor(0.5151, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 704.44it/s]


Test Loss: nan, Test Accuracy: 87.08%


100%|██████████| 117/117 [00:00<00:00, 406.94it/s]


Train Loss: tensor(0.5044, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 717.64it/s]


Test Loss: nan, Test Accuracy: 87.27%


100%|██████████| 117/117 [00:00<00:00, 338.75it/s]


Train Loss: tensor(0.4944, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 805.28it/s]

Test Loss: nan, Test Accuracy: 87.58%



