# Simple MNIST implementation
The goal of this notebook is to train a simple neural network to classify MNIST digits.  
This is meant to be a simple implementation of an MLP (multi-layer perceptron) from scratch, used to test the implementation of the layers (except the convolution layer) and of the losses.  

## Setup

### Imports

In [1]:
from os.path import join

import plotly.express as px
import kagglehub
import numpy as np
from rich.progress import Progress
import pandas as pd

from layers import Linear, Relu, Sigmoid
from losses import BinaryCrossentropy

### Data extraction

In [2]:
# Fetch the MNIST dataset through kagglehub, then build the path of each of the
# four IDX files (train/test images and labels) inside the returned directory.
dataset_path = kagglehub.dataset_download("hojjatk/mnist-dataset")
(
    train_image_path,
    train_labels_path,
    test_image_path,
    test_labels_path,
) = (
    join(dataset_path, idx_file)
    for idx_file in (
        'train-images.idx3-ubyte',
        'train-labels.idx1-ubyte',
        't10k-images.idx3-ubyte',
        't10k-labels.idx1-ubyte',
    )
)

def load_images(path) -> np.ndarray:
    """Read an IDX image file into a 2-D array of flattened, scaled images.

    The first 16 bytes of the file are skipped as header. The remaining bytes
    are read as unsigned 8-bit pixels, grouped into rows of 28*28 values and
    divided by 255.

    Args:
        path: Location of the IDX image file.

    Returns:
        Float array of shape (n_images, 784) with values in [0, 1].
    """
    header_size = 16
    pixels_per_image = 28**2
    with open(path, 'rb') as stream:
        raw_bytes = np.frombuffer(stream.read(), dtype=np.uint8)
    pixels = raw_bytes[header_size:]
    images = pixels.reshape(-1, pixels_per_image)
    return images / 255

def load_labels(path) -> np.ndarray:
    """Read an IDX label file and one-hot encode its labels.

    The first 8 bytes of the file are skipped as header. Every remaining byte
    is used as a row index into a 10x10 identity matrix, so each label must be
    in the range 0..9.

    Args:
        path: Location of the IDX label file.

    Returns:
        Float array of shape (n_labels, 10) with a single 1 per row.
    """
    header_size = 8
    with open(path, 'rb') as stream:
        content = stream.read()
    digits = np.frombuffer(content, dtype=np.uint8)[header_size:]
    one_hot_rows = np.eye(10)
    return one_hot_rows[digits]

# Load both splits: images through load_images, labels through load_labels.
train_dataset, train_labels = load_images(train_image_path), load_labels(train_labels_path)
test_dataset, test_labels = load_images(test_image_path), load_labels(test_labels_path)


In [3]:
# Sanity check: recover the digit of every training sample from its one-hot row.
train_labels.argmax(axis=1)

array([5, 0, 4, ..., 5, 6, 8], shape=(60000,))

In [4]:
# Sanity check: un-flatten the first training sample to 28x28 and display it.
first_digit = train_dataset[0].reshape(28, 28)
px.imshow(first_digit)

## Model definition

In [45]:
# Network layout: flattened image -> hidden layer with ReLU -> 10 sigmoid outputs.
# NOTE(review): Linear is presumably (input_size, output_size) — confirm in layers.py.
INPUT_SIZE = 28 * 28
HIDDEN_SIZE = 64
NB_CLASSES = 10

nn: list[Linear|Relu|Sigmoid] = [
    Linear(INPUT_SIZE, HIDDEN_SIZE),
    Relu(),
    Linear(HIDDEN_SIZE, NB_CLASSES),
    Sigmoid(),
]

# Loss object used by the training loop.
loss = BinaryCrossentropy()

In [25]:
def forward(activation:np.ndarray) -> np.ndarray:
    """Run an input through every layer of the module-level `nn`, first to last.

    Args:
        activation: Input handed to the first layer.

    Returns:
        Whatever the last layer's ``forward`` returns.
    """
    output = activation
    for module in nn:
        output = module.forward(output)
    return output

def backward(gradient:np.ndarray, learning_rate:float) -> np.ndarray:
    """Propagate a gradient through the module-level `nn`, last layer to first.

    Each layer's ``backward`` receives the value returned by the layer after it
    together with the learning rate.
    NOTE(review): layers presumably update their own parameters inside
    ``backward`` — confirm in layers.py.

    Args:
        gradient: Value handed to the last layer's ``backward``.
        learning_rate: Step size forwarded unchanged to every layer.

    Returns:
        The value returned by the first layer's ``backward``. The function used
        to return None despite its annotation; callers that ignore the result
        are unaffected.
    """
    for layer in reversed(nn):
        gradient = layer.backward(gradient, learning_rate)
    return gradient

def accuracy(activation:np.ndarray, labels:np.ndarray) -> float:
    """Fraction of rows whose highest output matches the highest label entry.

    Args:
        activation: 2-D array of network outputs, one row per sample.
        labels: 2-D array of targets with the same number of rows.

    Returns:
        Mean of the row-wise argmax matches, between 0 and 1.
    """
    predicted_classes = np.argmax(activation, axis=1)
    expected_classes = np.argmax(labels, axis=1)
    hits = predicted_classes == expected_classes
    return hits.mean()


## Training

In [47]:
NB_EPOCHS = 4
# NOTE(review): 1e-7 is a very small step size and may explain the slow
# learning reported in this notebook's conclusion — confirm it is intentional.
LEARNING_RATE = 1e-7
BATCH_SIZE = 128

# Per-batch training metrics, appended in iteration order.
stats = {
    "losses": [],
    "accuracies": [],
}

# Randomize the dataset and labels (same permutation for both, so each image
# keeps its label)
indices = np.random.permutation(train_dataset.shape[0])
train_dataset = train_dataset[indices]
train_labels = train_labels[indices]

# Start offset of every mini-batch. The last batch is shorter when the number
# of samples is not a multiple of BATCH_SIZE.
batch_starts = range(0, train_dataset.shape[0], BATCH_SIZE)

with Progress() as progress:
    # Count real iterations: floor-dividing the sample count by BATCH_SIZE
    # would leave out the trailing partial batch of each epoch.
    task = progress.add_task("Training", total=NB_EPOCHS * len(batch_starts))
    for epoch in range(NB_EPOCHS):
        for i in batch_starts:
            batch = train_dataset[i:i+BATCH_SIZE]
            labels = train_labels[i:i+BATCH_SIZE]
            activation = forward(batch)
            losses = loss.forward(activation, labels)
            stats["losses"].append(losses.mean())
            stats["accuracies"].append(accuracy(activation, labels))
            # ".2f" prints two decimals; the previous "2f" only set a minimum width.
            progress.update(task, advance=1, description=f"loss: {stats['losses'][-1]:.2f}")
            # NOTE(review): this feeds the loss values themselves into
            # backward(). Backpropagation normally starts from the gradient of
            # the loss with respect to the activations. The BinaryCrossentropy
            # API is not visible here — confirm what should be passed.
            backward(losses, LEARNING_RATE)

# Create a DataFrame from the stats dictionary (one row per training batch)
stats = pd.DataFrame({
    "batch": list(range(len(stats["losses"]))),
    "loss": stats["losses"],
    "accuracy": stats["accuracies"]
})

# Long format (one row per batch and metric) so that each metric gets its own facet.
metrics_long = stats.melt(
    id_vars="batch",
    value_vars=["loss", "accuracy"],
    var_name="metric",
    value_name="value",
)

# Create the line plot with facets and different y-axes for loss and accuracy.
# Previously only the loss was drawn, so the accuracy curve was never shown.
fig = px.line(
    metrics_long,
    x="batch",
    y="value",
    facet_row="metric",
    title="Training Metrics Over Epochs",
    height=600
)
fig.update_yaxes(matches=None)  # Allow different y-axes scales
fig.show()


Output()

## Evaluation

In [15]:
# Score the trained network on the whole training set in one forward pass.
activation = forward(train_dataset)
y_pred = activation.argmax(axis=1)
y_true = train_labels.argmax(axis=1)
# Stored under its own name: binding the result to `accuracy` would replace the
# accuracy() function and break the training cell when it is run again.
train_accuracy = (y_pred == y_true).mean()

train_accuracy

np.float64(0.17153333333333334)

In [13]:
# Raw network outputs (one sigmoid value per digit) for the first row of train_dataset.
activation[0]

array([2.52170976e-53, 9.34962973e-24, 2.59598672e-52, 7.35484686e-51,
       9.29921082e-44, 3.21470185e-49, 2.09441078e-41, 7.24184871e-72,
       3.51394128e-54, 5.48907100e-46])

In [9]:
# Mean of the per-batch accuracies recorded during training. They are read from
# the `stats` frame because no variable named `accuracies` exists in this notebook.
stats["accuracy"].mean()

np.float64(0.15876666666666667)

## Conclusion

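While the model does learn, it does not perform well: it reaches only about 17% accuracy on the training set, not far above the 10% expected from random guessing.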
It seems like we will need to perform the computation in batches.