# Simple MNIST implementation
The goal of this notebook is to train a simple neural network to classify MNIST digits.  
This is meant to be a simple from-scratch implementation of an MDL, used to test the implementation of the layers (except the convolution layer) and of the losses.  

## Setup

### Imports

In [2]:
from os.path import join

import plotly.express as px
import kagglehub
import numpy as np
from rich.progress import Progress
import pandas as pd

from layers import Linear, Relu, Sigmoid
from losses import BinaryCrossentropy

### Data extraction

In [3]:
# Fetch the MNIST dataset through kagglehub; the returned location is used
# below as the directory containing the four IDX files.
dataset_path = kagglehub.dataset_download("hojjatk/mnist-dataset")

# Image and label files for the training and test splits.
train_image_path, train_labels_path, test_image_path, test_labels_path = (
    join(dataset_path, filename)
    for filename in (
        'train-images.idx3-ubyte',
        'train-labels.idx1-ubyte',
        't10k-images.idx3-ubyte',
        't10k-labels.idx1-ubyte',
    )
)

def load_images(path) -> np.ndarray:
    """Read an image file and return its pixels as flattened, scaled rows.

    The first 16 bytes of the file are skipped, the remaining bytes are
    grouped into rows of 28 * 28 values, and every value is divided by 255.
    """
    header_size = 16
    pixels_per_image = 28**2
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()
    pixels = np.frombuffer(raw_bytes, dtype=np.uint8)[header_size:]
    return pixels.reshape(-1, pixels_per_image) / 255

def load_labels(path) -> np.ndarray:
    """Read a label file and return the labels one-hot encoded over 10 classes.

    The first 8 bytes of the file are skipped; every remaining byte is used
    as a row index into a 10x10 identity matrix.
    """
    header_size = 8
    nb_classes = 10
    with open(path, 'rb') as label_file:
        raw_bytes = label_file.read()
    digits = np.frombuffer(raw_bytes, dtype=np.uint8)[header_size:]
    identity = np.eye(nb_classes)
    return identity[digits]

# Flattened, scaled images for both splits.
train_dataset, test_dataset = (
    load_images(image_path)
    for image_path in (train_image_path, test_image_path)
)
# Matching one-hot encoded labels.
train_labels, test_labels = (
    load_labels(labels_path)
    for labels_path in (train_labels_path, test_labels_path)
)


In [4]:
# Recover the digit index of every training sample from its one-hot row.
train_labels.argmax(axis=1)

array([5, 0, 4, ..., 5, 6, 8], shape=(60000,))

In [5]:
# Display the first training sample as a 28x28 image.
px.imshow(np.reshape(train_dataset[0], (28, 28)))

## Model definition

In [6]:
# Layer widths. Naming them keeps the two Linear layers in agreement:
# the hidden width appears once instead of being repeated as a bare number.
INPUT_SIZE = 28**2   # one input per pixel of a flattened 28x28 image
HIDDEN_SIZE = 64
NB_CLASSES = 10      # one output per digit, matching the one-hot labels

# The network is a plain list of layers; `forward` applies them in order and
# `backward` walks them in reverse.
nn: list[Linear|Relu|Sigmoid] = [
    Linear(INPUT_SIZE, HIDDEN_SIZE),
    Relu(),
    Linear(HIDDEN_SIZE, NB_CLASSES),
    Sigmoid(),
]
loss = BinaryCrossentropy()

In [7]:
def forward(activation:np.ndarray) -> np.ndarray:
    """Run ``activation`` through every layer of ``nn`` in order.

    Returns the output of the last layer.
    """
    output = activation
    for layer in nn:
        output = layer.forward(output)
    return output

def backward(gradient:np.ndarray, learning_rate:float) -> np.ndarray:
    """Propagate ``gradient`` through the layers of ``nn`` in reverse order.

    Each layer's ``backward`` is called with the value returned by the layer
    that follows it in ``nn``, together with ``learning_rate``
    (presumably used by the layers to update their parameters -- confirm in
    ``layers``).

    Returns:
        The value returned by the ``backward`` of the first layer of ``nn``
        (presumably the gradient with respect to the network input --
        confirm in ``layers``).
    """
    for layer in reversed(nn):
        gradient = layer.backward(gradient, learning_rate)
    # The signature promises an array; return the final gradient instead of
    # implicitly returning None.
    return gradient

def compute_accuracy(activation:np.ndarray, labels:np.ndarray) -> float:
    """Return the fraction of rows whose arg-max agrees between both arrays.

    Both arrays are reduced along axis 1, so each row is one sample.
    """
    predicted = np.argmax(activation, axis=1)
    expected = np.argmax(labels, axis=1)
    return np.mean(predicted == expected)


In [8]:
# Baseline: accuracy on the full training set before the training cell runs.
activations = forward(train_dataset)
y_true = np.argmax(train_labels, axis=1)
y_pred = np.argmax(activations, axis=1)
accuracy = np.mean(y_pred == y_true)

accuracy

np.float64(0.06658333333333333)

In [9]:
# Share of each class: per-column total divided by the grand total.
np.sum(train_labels, axis=0) / np.sum(train_labels)

array([0.09871667, 0.11236667, 0.0993    , 0.10218333, 0.09736667,
       0.09035   , 0.09863333, 0.10441667, 0.09751667, 0.09915   ])

In [10]:
# Randomize the dataset and labels with one shared permutation so every image
# stays aligned with its label. A seeded generator makes the resulting order
# reproducible after a kernel restart.
SHUFFLE_SEED = 42
indices = np.random.default_rng(SHUFFLE_SEED).permutation(train_dataset.shape[0])
train_dataset = train_dataset[indices]
train_labels = train_labels[indices]

## Training

In [11]:
# Training metrics; the training loop appends to both lists.
stats = dict(losses=[], accuracies=[])

In [12]:
NB_EPOCHS = 30
LEARNING_RATE = 0.02
BATCH_SIZE = 128

nb_samples = train_dataset.shape[0]
# Ceiling division: the last, possibly shorter, batch of an epoch is still one
# progress step, so the bar total must count it.
batches_per_epoch = -(-nb_samples // BATCH_SIZE)

with Progress() as progress:
    task = progress.add_task("Training", total=NB_EPOCHS * batches_per_epoch)
    for epoch in range(NB_EPOCHS):
        # One pass over the training set in consecutive mini-batches.
        for i in range(0, nb_samples, BATCH_SIZE):
            batch = train_dataset[i:i+BATCH_SIZE]
            labels = train_labels[i:i+BATCH_SIZE]
            activations = forward(batch)
            mean_loss = loss.forward(activations, labels)
            gradients = loss.backward(activations, labels)
            backward(gradients, LEARNING_RATE)
            # `.2f` sets the precision; the previous `2f` only set a minimum width.
            progress.update(task, advance=1, description=f"loss: {mean_loss:.2f}")
        # End of epoch: record loss and accuracy over the whole training set.
        dataset_activations = forward(train_dataset)
        mean_dataset_loss = loss.forward(dataset_activations, train_labels)
        stats["losses"].append(mean_dataset_loss)
        stats["accuracies"].append(compute_accuracy(dataset_activations, train_labels))

Output()

In [13]:
# `stats` receives one loss and one accuracy value per epoch, so the x axis
# counts epochs. Each metric gets its own facet row with an independent y axis.
fig = (
    px.scatter(
        (
            pd.DataFrame(stats)
            .assign(epoch=range(len(stats["losses"])))
            .melt(id_vars="epoch", value_vars=["losses", "accuracies"])
        ),
        x="epoch",
        y="value",
        color="variable",
        facet_row="variable",
        title="Training Metrics Over Epochs",
        height=600,
    )
    .update_yaxes(matches=None)
    .update_yaxes(showticklabels=True)
)
fig.show()