# Simple MNIST implementation
The goal of this notebook is to train a simple neural network to classify MNIST digits.  
This is meant to be a simple implementation of an MDL from scratch to test the implementation of the layers(except the convolution layer) and losses.  

## Setup

### Imports

In [1]:
from os.path import join

import plotly.express as px
import kagglehub
import numpy as np
from rich.progress import Progress

from layers import Linear, Relu, SoftMax
from losses import BinaryCrossentropy

### Data extraction

In [2]:
dataset_path = kagglehub.dataset_download("hojjatk/mnist-dataset")
train_image_path = join(dataset_path, 'train-images.idx3-ubyte')
train_labels_path = join(dataset_path, 'train-labels.idx1-ubyte')
test_image_path = join(dataset_path, 't10k-images.idx3-ubyte')
test_labels_path = join(dataset_path, 't10k-labels.idx1-ubyte')

def load_images(path) -> np.ndarray:
    with open(path, 'rb') as f:
        return (
            np.frombuffer(f.read(), dtype=np.uint8)
            [16:]
            .reshape(-1, 28**2)
            / 255
        )

def load_labels(path) -> np.ndarray:
    with open(path, 'rb') as f:
        label_idxs = np.frombuffer(f.read(), dtype=np.uint8)[8:]
        labels = np.eye(10)[label_idxs]
        return labels

train_dataset = load_images(train_image_path)
train_labels = load_labels(train_labels_path)
test_dataset = load_images(test_image_path)
test_labels = load_labels(test_labels_path)


In [3]:
np.argmax(train_labels, axis=1)

array([5, 0, 4, ..., 5, 6, 8], shape=(60000,))

In [4]:
px.imshow(train_dataset[0].reshape(28, 28))

## Model definition

In [5]:
INPUT_SIZE = 28**2
nn: list[Linear|Relu|SoftMax] = [
    Linear(INPUT_SIZE, 64),
    Relu(),
    Linear(64, 64),
    Relu(),
    Linear(64, 10),
    SoftMax(),
]
loss = BinaryCrossentropy()

In [6]:
def forward(activation:np.ndarray) -> np.ndarray:
    for layer in nn:
        activation = layer.forward(activation)
    return activation

def backward(gradient:np.ndarray, learning_rate:float) -> np.ndarray:
    for layer in reversed(nn):
        gradient = layer.backward(gradient, learning_rate)

## Training

In [7]:
NB_EPOCHS = 20
LEARNING_RATE = 1e-5

BTACH_SIZE = 32
gradient_lst = []

with Progress() as progress:
    task = progress.add_task("Training", total=NB_EPOCHS * train_dataset.shape[0])
    for epoch in range(NB_EPOCHS):
        for i, (activation, label) in enumerate(zip(train_dataset, train_labels)):
            activation = forward(activation)
            loss_value = loss.forward(activation, label)
            gradient_lst.append(loss.backward(label))
            if len(gradient_lst) == BTACH_SIZE:
                gradient = np.mean(gradient_lst, axis=0)
                backward(gradient, LEARNING_RATE)
                gradient_lst = []
            progress.update(task, advance=1, description=f"learning rate: {LEARNING_RATE:2f}")

Output()

## Evaluation

In [10]:
accuracies = []
for activation, label in zip(train_dataset, train_labels):
    activation = forward(activation)
    y_pred = activation.argmax()
    y_true = label.argmax()
    accuracies.append(y_pred == y_true)

px.histogram(accuracies)

In [9]:
sum(accuracies) / len(accuracies)

np.float64(0.12873333333333334)

## Conclusion

While the model does learn, it does not perform well.
It seems like we will need to perform the computation in batches.