In [1]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3060 Laptop GPU
Using device: cuda


In [2]:
#Original source: https://www.kaggle.com/code/hojjatk/read-mnist-dataset
#It has been modified for ease of use w/ pytorch

#You do NOT need to modify ANY code in this file!

import numpy as np
import struct
from array import array
import torch

class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        n = 60000 if "train" in images_filepath else 10000
        labels = torch.zeros((n, 10))
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            l = torch.tensor(array("B", file.read())).unsqueeze(-1)
            l = torch.concatenate((torch.arange(0, n).unsqueeze(-1), l), dim = 1).type(torch.int32)
            labels[l[:,0], l[:,1]] = 1

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())
        images = torch.zeros((n, 28**2))
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            #img = img.reshape(28, 28)
            images[i, :] = torch.tensor(img)

        return images, labels

    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [3]:
import torch

class Tanh:
    def forward(self, x: torch.tensor) -> torch.tensor:
        return  (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
    
    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        tanh_org = (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
        tanh_derivative = 1 - tanh_org * tanh_org
        return delta * tanh_derivative

class Sigmoid:
    def forward(self, x: torch.tensor) -> torch.tensor:
        return 1 / (1 + torch.exp(-x))
    
    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        sig_x_org = 1 / (1 + torch.exp(-x))
        sig_x_derivative = (sig_x_org * (1 - sig_x_org))
        return delta * sig_x_derivative

In [4]:
import torch
import numpy as np
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class MLP:
    '''
    Multi-Layer Perceptron (MLP) for MNIST classification.
    Implements forward propagation, backpropagation, and training.
    '''
    
    def __init__(self, layer_sizes: list[int]):
        self.layer_sizes: list[int] = layer_sizes
        self.num_layers = len(layer_sizes) - 1
        self.weights: list[torch.tensor] = []
        self.biases: list[torch.tensor] = []
        self.features: list[torch.tensor] = []  

        self.learning_rate: float = 1
        self.batch_size: int = 1
        self.activation_function: callable[[torch.tensor], torch.tensor] = Sigmoid

    def set_hp(self, lr: float, bs: int, activation: object) -> None:
        """
        Set hyperparameters for training.
        """
        self.learning_rate = lr
        self.batch_size = bs
        self.activation_function = activation()

    def initialize(self) -> None:
        """
        Initialize all biases to zero and weights using Xavier initialization.
        """
        for i in range(self.num_layers):
            d_in = self.layer_sizes[i]
            d_out = self.layer_sizes[i + 1]
            w_range = np.sqrt(6 / (d_in + d_out))
            W = torch.empty(d_in, d_out, device=device).uniform_(-w_range, w_range)
            self.weights.append(W)
            b = torch.zeros(1, d_out, device=device) 
            self.biases.append(b)
            

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Forward propagation through all layers.
        Applies activation function to all layers except the last one.
        """
        self.features = [x.to(device)]  

        for i in range(self.num_layers):  
            x = torch.matmul(x, self.weights[i]) + self.biases[i]
            x = self.activation_function.forward(x)  
            self.features.append(x) 
        return x

    

    def backward(self, delta: torch.Tensor) -> None:
        '''
        This function should backpropagate the provided delta through the entire MLP, and update the weights according to the hyper-parameters
        stored in the class variables.
        '''
        # back propogation starts from the result
        for i in reversed(range(self.num_layers)):
            x = self.features[i]

            delta = self.activation_function.backward(delta,self.features[i+1])
            # Computing gradients
            dW = torch.matmul(x.T,delta) / self.batch_size
            db = torch.sum(delta, dim=0, keepdim=True) / self.batch_size

            # Updating weights and biases with learning rate
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
            delta = torch.matmul(delta,self.weights[i].T)

    # def backward(self, delta: torch.tensor) -> None:
    #     """
    #     Backpropagation through all layers to compute gradients.
    #     Updates weights using gradient descent.
    #     """
    #     # grad_weights = [torch.zeros_like(w) for w in self.weights]
    #     # grad_biases = [torch.zeros_like(b) for b in self.biases]

    #     for i in reversed(range(self.num_layers)):  
    #         X = self.features[i]
    #         dW = torch.matmul(X.T, delta) / self.batch_size  
    #         db = torch.sum(delta,dim=0,keepdim=True) / self.batch_size
            
    #         self.weights[i] -= self.learning_rate * dW
    #         self.biases[i] -= self.learning_rate * db

    #         # if i > 0:
    #         #     delta = (delta @ self.weights[i].T)
    #         #     if i > 1:
    #         #         delta *= self.activation_function.backward(delta,self.features[i-1])

    #         delta = torch.matmul(delta, self.weights[i].T) * self.activation_function.backward(torch.ones_like(X), X)



def TrainMLP(model: MLP, x_train: torch.tensor, y_train: torch.tensor) -> MLP:
    """
    Train the MLP for one epoch using mini-batch gradient descent with GPU support.
    """
    bs = model.batch_size
    N = x_train.shape[0]
    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0  

    for i in tqdm.tqdm(range(N // bs)):
        x = x_train[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_train[idx[i * bs:(i + 1) * bs], ...].to(device)

        
        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        
        l = -1 * torch.sum(y * torch.log(p)) ### batch size not required here
        L += l

       
        delta = p - y
        model.backward(delta)

    print("Train Loss:", L / ((N // bs) * bs))



def TestMLP(model: MLP, x_test: torch.tensor, y_test: torch.tensor) -> tuple[float, float]:
    """
    Evaluate the MLP on test data using GPU support.
    """
    bs = model.batch_size
    N = x_test.shape[0]

    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0
    A = 0

    for i in tqdm.tqdm(range(N // bs)):
        x = x_test[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_test[idx[i * bs:(i + 1) * bs], ...].to(device)

        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        
        l = -1 * torch.sum(y * torch.log(p))
        L += l.item()

        
        A += torch.sum(torch.argmax(p, dim=1) == torch.argmax(y, dim=1)).item()

    test_loss = L / ((N // bs) * bs)
    test_accuracy = 100 * A / ((N // bs) * bs)

    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy:.2f}%")

    return test_loss, test_accuracy  


def normalize_mnist() -> tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
    '''
    This function loads the MNIST dataset, then normalizes the "X" values to have zero mean, unit variance.
    '''

    #IMPORTANT!!!#
    #UPDATE THE PATH BELOW!#
    base_path = "C:\\Users\\yoges\\Data_Science_Preparation\\CSCI 5922 Neural Networks and Deep Learning\\Lab Assignments\\Lab1Code\\MNIST\\"
    #^^^^^^^^#


    mnist = MnistDataloader(base_path + "train-images.idx3-ubyte", base_path + "train-labels.idx1-ubyte",
                            base_path + "t10k-images.idx3-ubyte", base_path + "t10k-labels.idx1-ubyte")
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_mean = torch.mean(x_train, dim = 0, keepdim = True)
    x_std = torch.std(x_train, dim = 0, keepdim = True)

    x_train -= x_mean
    x_train /= x_std
    x_train[x_train != x_train] = 0

    x_test -= x_mean
    x_test /= x_std
    x_test[x_test != x_test] = 0


    return x_train, y_train, x_test, y_test

def main():
    """
    Main function to train and evaluate the MLP model on MNIST using GPU.
    """
    x_train, y_train, x_test, y_test = normalize_mnist()

   
    model = MLP([784, 256, 10])  
    model.initialize()
    model.set_hp(lr=1e-3, bs=512, activation=Sigmoid)  
    
    E = 25
    for _ in range(E):
        TrainMLP(model, x_train, y_train)
        TestMLP(model, x_test, y_test)


if __name__ == "__main__":
    main()


Using device: cuda


100%|██████████| 117/117 [00:00<00:00, 238.62it/s]


Train Loss: tensor(2.3170, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 495.17it/s]


Test Loss: nan, Test Accuracy: 13.59%


100%|██████████| 117/117 [00:00<00:00, 393.66it/s]


Train Loss: tensor(2.3106, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 590.09it/s]


Test Loss: nan, Test Accuracy: 14.05%


100%|██████████| 117/117 [00:00<00:00, 408.75it/s]


Train Loss: tensor(2.3043, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 557.36it/s]


Test Loss: nan, Test Accuracy: 14.77%


100%|██████████| 117/117 [00:00<00:00, 385.18it/s]


Train Loss: tensor(2.2979, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 580.69it/s]


Test Loss: nan, Test Accuracy: 15.76%


100%|██████████| 117/117 [00:00<00:00, 343.02it/s]


Train Loss: tensor(2.2915, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 605.69it/s]


Test Loss: nan, Test Accuracy: 17.07%


100%|██████████| 117/117 [00:00<00:00, 405.55it/s]


Train Loss: tensor(2.2852, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 482.50it/s]


Test Loss: nan, Test Accuracy: 18.17%


100%|██████████| 117/117 [00:00<00:00, 382.03it/s]


Train Loss: tensor(2.2789, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 603.85it/s]


Test Loss: nan, Test Accuracy: 19.52%


100%|██████████| 117/117 [00:00<00:00, 393.84it/s]


Train Loss: tensor(2.2726, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 601.57it/s]


Test Loss: nan, Test Accuracy: 20.87%


100%|██████████| 117/117 [00:00<00:00, 373.17it/s]


Train Loss: tensor(2.2664, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 602.80it/s]


Test Loss: nan, Test Accuracy: 22.13%


100%|██████████| 117/117 [00:00<00:00, 427.01it/s]


Train Loss: tensor(2.2602, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 621.33it/s]


Test Loss: nan, Test Accuracy: 23.26%


100%|██████████| 117/117 [00:00<00:00, 388.90it/s]


Train Loss: tensor(2.2540, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 589.95it/s]


Test Loss: nan, Test Accuracy: 24.67%


100%|██████████| 117/117 [00:00<00:00, 387.52it/s]


Train Loss: tensor(2.2480, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 547.56it/s]


Test Loss: nan, Test Accuracy: 26.19%


100%|██████████| 117/117 [00:00<00:00, 378.14it/s]


Train Loss: tensor(2.2419, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 590.68it/s]


Test Loss: nan, Test Accuracy: 27.85%


100%|██████████| 117/117 [00:00<00:00, 398.96it/s]


Train Loss: tensor(2.2359, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 611.18it/s]


Test Loss: nan, Test Accuracy: 29.45%


100%|██████████| 117/117 [00:00<00:00, 400.28it/s]


Train Loss: tensor(2.2300, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 563.64it/s]


Test Loss: nan, Test Accuracy: 31.52%


100%|██████████| 117/117 [00:00<00:00, 391.11it/s]


Train Loss: tensor(2.2240, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 597.10it/s]


Test Loss: nan, Test Accuracy: 33.70%


100%|██████████| 117/117 [00:00<00:00, 372.94it/s]


Train Loss: tensor(2.2182, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 581.92it/s]


Test Loss: nan, Test Accuracy: 35.84%


100%|██████████| 117/117 [00:00<00:00, 355.76it/s]


Train Loss: tensor(2.2125, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 517.61it/s]


Test Loss: nan, Test Accuracy: 37.84%


100%|██████████| 117/117 [00:00<00:00, 323.71it/s]


Train Loss: tensor(2.2067, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 674.29it/s]


Test Loss: nan, Test Accuracy: 39.63%


100%|██████████| 117/117 [00:00<00:00, 378.80it/s]


Train Loss: tensor(2.2011, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 658.69it/s]


Test Loss: nan, Test Accuracy: 41.14%


100%|██████████| 117/117 [00:00<00:00, 374.62it/s]


Train Loss: tensor(2.1955, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 521.05it/s]


Test Loss: nan, Test Accuracy: 42.77%


100%|██████████| 117/117 [00:00<00:00, 371.81it/s]


Train Loss: tensor(2.1900, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 625.25it/s]


Test Loss: nan, Test Accuracy: 44.14%


100%|██████████| 117/117 [00:00<00:00, 385.95it/s]


Train Loss: tensor(2.1844, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 394.59it/s]


Test Loss: nan, Test Accuracy: 45.34%


100%|██████████| 117/117 [00:00<00:00, 385.31it/s]


Train Loss: tensor(2.1791, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 644.11it/s]


Test Loss: nan, Test Accuracy: 46.74%


100%|██████████| 117/117 [00:00<00:00, 358.40it/s]


Train Loss: tensor(2.1737, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 597.86it/s]

Test Loss: nan, Test Accuracy: 47.82%



