In [1]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 3060 Laptop GPU
Using device: cuda


In [2]:
#Original source: https://www.kaggle.com/code/hojjatk/read-mnist-dataset
#It has been modified for ease of use w/ pytorch

#You do NOT need to modify ANY code in this file!

import numpy as np
import struct
from array import array
import torch

class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        n = 60000 if "train" in images_filepath else 10000
        labels = torch.zeros((n, 10))
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            l = torch.tensor(array("B", file.read())).unsqueeze(-1)
            l = torch.concatenate((torch.arange(0, n).unsqueeze(-1), l), dim = 1).type(torch.int32)
            labels[l[:,0], l[:,1]] = 1

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())
        images = torch.zeros((n, 28**2))
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            #img = img.reshape(28, 28)
            images[i, :] = torch.tensor(img)

        return images, labels

    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [3]:
import torch

class Tanh:
    def forward(self, x: torch.tensor) -> torch.tensor:
        return  (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
    
    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        tanh_org = (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
        tanh_derivative = 1 - tanh_org * tanh_org
        return delta * tanh_derivative

class Sigmoid:
    def forward(self, x: torch.tensor) -> torch.tensor:
        return 1 / (1 + torch.exp(-x))
    
    def backward(self, delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        sig_x_org = 1 / (1 + torch.exp(-x))
        sig_x_derivative = (sig_x_org * (1 - sig_x_org))
        return delta * sig_x_derivative

In [4]:
import torch
import numpy as np
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class MLP:
    '''
    Multi-Layer Perceptron (MLP) for MNIST classification.
    Implements forward propagation, backpropagation, and training.
    '''
    
    def __init__(self, layer_sizes: list[int]):
        self.layer_sizes: list[int] = layer_sizes
        self.num_layers = len(layer_sizes) - 1
        self.weights: list[torch.tensor] = []
        self.biases: list[torch.tensor] = []
        self.features: list[torch.tensor] = []  

        self.learning_rate: float = 1
        self.batch_size: int = 1
        self.activation_function: callable[[torch.tensor], torch.tensor] = Tanh

    def set_hp(self, lr: float, bs: int, activation: object) -> None:
        """
        Set hyperparameters for training.
        """
        self.learning_rate = lr
        self.batch_size = bs
        self.activation_function = activation()

    def initialize(self) -> None:
        """
        Initialize all biases to zero and weights using Xavier initialization.
        """
        for i in range(self.num_layers):
            d_in = self.layer_sizes[i]
            d_out = self.layer_sizes[i + 1]
            w_range = np.sqrt(6 / (d_in + d_out))
            W = torch.empty(d_in, d_out, device=device).uniform_(-w_range, w_range)
            self.weights.append(W)
            b = torch.zeros(1, d_out, device=device) 
            self.biases.append(b)
            

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        Forward propagation through all layers.
        Applies activation function to all layers except the last one.
        """
        self.features = [x.to(device)]  

        for i in range(self.num_layers):  
            x = torch.matmul(x, self.weights[i]) + self.biases[i]
            x = self.activation_function.forward(x)  
            self.features.append(x) 
        return x

    def backward(self, delta: torch.Tensor) -> None:
        '''
        This function should backpropagate the provided delta through the entire MLP, and update the weights according to the hyper-parameters
        stored in the class variables.
        '''
        # back propogation starts from the result
        for i in reversed(range(self.num_layers)):
            x = self.features[i]

            delta = self.activation_function.backward(delta,self.features[i+1])
            # Computing gradients
            dW = torch.matmul(x.T,delta) / self.batch_size
            db = torch.sum(delta, dim=0, keepdim=True) / self.batch_size

            # Updating weights and biases with learning rate
            self.weights[i] -= self.learning_rate * dW
            self.biases[i] -= self.learning_rate * db
            delta = torch.matmul(delta,self.weights[i].T)

    # def backward(self, delta: torch.tensor) -> None:
    #     """
    #     Backpropagation through all layers to compute gradients.
    #     Updates weights using gradient descent.
    #     """
    #     # grad_weights = [torch.zeros_like(w) for w in self.weights]
    #     # grad_biases = [torch.zeros_like(b) for b in self.biases]

    #     for i in reversed(range(self.num_layers)):  
    #         X = self.features[i]
    #         dW = torch.matmul(X.T, delta) / self.batch_size  
    #         db = torch.sum(delta,dim=0,keepdim=True) / self.batch_size
            
    #         self.weights[i] -= self.learning_rate * dW
    #         self.biases[i] -= self.learning_rate * db

    #         # if i > 0:
    #         #     delta = (delta @ self.weights[i].T)
    #         #     if i > 1:
    #         #         delta *= self.activation_function.backward(delta,self.features[i-1])

    #         delta = torch.matmul(delta, self.weights[i].T) * self.activation_function.backward(torch.ones_like(X), X)



def TrainMLP(model: MLP, x_train: torch.tensor, y_train: torch.tensor) -> MLP:
    """
    Train the MLP for one epoch using mini-batch gradient descent with GPU support.
    """
    bs = model.batch_size
    N = x_train.shape[0]
    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0  

    for i in tqdm.tqdm(range(N // bs)):
        x = x_train[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_train[idx[i * bs:(i + 1) * bs], ...].to(device)

        
        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        
        l = -1 * torch.sum(y * torch.log(p)) ### batch size not required here
        L += l

       
        delta = p - y
        model.backward(delta)

    print("Train Loss:", L / ((N // bs) * bs))



def TestMLP(model: MLP, x_test: torch.tensor, y_test: torch.tensor) -> tuple[float, float]:
    """
    Evaluate the MLP on test data using GPU support.
    """
    bs = model.batch_size
    N = x_test.shape[0]

    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0
    A = 0

    for i in tqdm.tqdm(range(N // bs)):
        x = x_test[idx[i * bs:(i + 1) * bs], ...].to(device)
        y = y_test[idx[i * bs:(i + 1) * bs], ...].to(device)

        y_hat = model.forward(x)

        
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim=1, keepdim=True)

        
        l = -1 * torch.sum(y * torch.log(p))
        L += l.item()

        
        A += torch.sum(torch.argmax(p, dim=1) == torch.argmax(y, dim=1)).item()

    test_loss = L / ((N // bs) * bs)
    test_accuracy = 100 * A / ((N // bs) * bs)

    print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy:.2f}%")

    return test_loss, test_accuracy  


def normalize_mnist() -> tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
    '''
    This function loads the MNIST dataset, then normalizes the "X" values to have zero mean, unit variance.
    '''

    #IMPORTANT!!!#
    #UPDATE THE PATH BELOW!#
    base_path = "C:\\Users\\yoges\\Data_Science_Preparation\\CSCI 5922 Neural Networks and Deep Learning\\Lab Assignments\\Lab1Code\\MNIST\\"
    #^^^^^^^^#


    mnist = MnistDataloader(base_path + "train-images.idx3-ubyte", base_path + "train-labels.idx1-ubyte",
                            base_path + "t10k-images.idx3-ubyte", base_path + "t10k-labels.idx1-ubyte")
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_mean = torch.mean(x_train, dim = 0, keepdim = True)
    x_std = torch.std(x_train, dim = 0, keepdim = True)

    x_train -= x_mean
    x_train /= x_std
    x_train[x_train != x_train] = 0

    x_test -= x_mean
    x_test /= x_std
    x_test[x_test != x_test] = 0


    return x_train, y_train, x_test, y_test

def main():
    """
    Main function to train and evaluate the MLP model on MNIST using GPU.
    """
    x_train, y_train, x_test, y_test = normalize_mnist()

   
    model = MLP([784, 256, 10])  
    model.initialize()
    model.set_hp(lr=1e-3, bs=512, activation=Tanh)  
    
    E = 25
    for _ in range(E):
        TrainMLP(model, x_train, y_train)
        TestMLP(model, x_test, y_test)


if __name__ == "__main__":
    main()


Using device: cuda


100%|██████████| 117/117 [00:00<00:00, 227.12it/s]


Train Loss: tensor(2.1271, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 482.62it/s]


Test Loss: nan, Test Accuracy: 37.97%


100%|██████████| 117/117 [00:00<00:00, 330.52it/s]


Train Loss: tensor(1.8806, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 483.13it/s]


Test Loss: nan, Test Accuracy: 51.79%


100%|██████████| 117/117 [00:00<00:00, 367.57it/s]


Train Loss: tensor(1.7484, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 595.23it/s]


Test Loss: nan, Test Accuracy: 59.92%


100%|██████████| 117/117 [00:00<00:00, 353.85it/s]


Train Loss: tensor(1.6745, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 529.90it/s]


Test Loss: nan, Test Accuracy: 64.94%


100%|██████████| 117/117 [00:00<00:00, 342.46it/s]


Train Loss: tensor(1.6295, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 532.31it/s]


Test Loss: nan, Test Accuracy: 68.40%


100%|██████████| 117/117 [00:00<00:00, 331.43it/s]


Train Loss: tensor(1.5997, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 559.74it/s]


Test Loss: nan, Test Accuracy: 71.08%


100%|██████████| 117/117 [00:00<00:00, 351.77it/s]


Train Loss: tensor(1.5786, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 543.67it/s]


Test Loss: nan, Test Accuracy: 72.45%


100%|██████████| 117/117 [00:00<00:00, 365.76it/s]


Train Loss: tensor(1.5628, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 553.42it/s]


Test Loss: nan, Test Accuracy: 73.55%


100%|██████████| 117/117 [00:00<00:00, 357.43it/s]


Train Loss: tensor(1.5506, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 522.42it/s]


Test Loss: nan, Test Accuracy: 74.54%


100%|██████████| 117/117 [00:00<00:00, 317.32it/s]


Train Loss: tensor(1.5409, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 602.05it/s]


Test Loss: nan, Test Accuracy: 75.17%


100%|██████████| 117/117 [00:00<00:00, 358.70it/s]


Train Loss: tensor(1.5332, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 562.99it/s]


Test Loss: nan, Test Accuracy: 75.61%


100%|██████████| 117/117 [00:00<00:00, 371.11it/s]


Train Loss: tensor(1.5268, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 565.50it/s]


Test Loss: nan, Test Accuracy: 75.86%


100%|██████████| 117/117 [00:00<00:00, 365.83it/s]


Train Loss: tensor(1.5215, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 557.36it/s]


Test Loss: nan, Test Accuracy: 76.16%


100%|██████████| 117/117 [00:00<00:00, 315.27it/s]


Train Loss: tensor(1.5171, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 579.72it/s]


Test Loss: nan, Test Accuracy: 76.30%


100%|██████████| 117/117 [00:00<00:00, 347.91it/s]


Train Loss: tensor(1.5137, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 574.58it/s]


Test Loss: nan, Test Accuracy: 76.48%


100%|██████████| 117/117 [00:00<00:00, 360.74it/s]


Train Loss: tensor(1.5106, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 501.27it/s]


Test Loss: nan, Test Accuracy: 76.57%


100%|██████████| 117/117 [00:00<00:00, 300.83it/s]


Train Loss: tensor(1.5082, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 617.41it/s]


Test Loss: nan, Test Accuracy: 76.76%


100%|██████████| 117/117 [00:00<00:00, 319.17it/s]


Train Loss: tensor(1.5060, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 445.54it/s]


Test Loss: nan, Test Accuracy: 76.80%


100%|██████████| 117/117 [00:00<00:00, 367.67it/s]


Train Loss: tensor(1.5043, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 591.62it/s]


Test Loss: nan, Test Accuracy: 76.93%


100%|██████████| 117/117 [00:00<00:00, 359.95it/s]


Train Loss: tensor(1.5026, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 503.90it/s]


Test Loss: nan, Test Accuracy: 76.93%


100%|██████████| 117/117 [00:00<00:00, 336.17it/s]


Train Loss: tensor(1.5013, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 582.44it/s]


Test Loss: nan, Test Accuracy: 76.83%


100%|██████████| 117/117 [00:00<00:00, 347.11it/s]


Train Loss: tensor(1.5003, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 555.82it/s]


Test Loss: nan, Test Accuracy: 76.95%


100%|██████████| 117/117 [00:00<00:00, 309.74it/s]


Train Loss: tensor(1.4994, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 611.15it/s]


Test Loss: nan, Test Accuracy: 76.98%


100%|██████████| 117/117 [00:00<00:00, 364.80it/s]


Train Loss: tensor(1.4986, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 539.87it/s]


Test Loss: nan, Test Accuracy: 77.07%


100%|██████████| 117/117 [00:00<00:00, 338.22it/s]


Train Loss: tensor(1.4979, device='cuda:0')


100%|██████████| 19/19 [00:00<00:00, 606.75it/s]

Test Loss: nan, Test Accuracy: 77.05%



