In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("hojjatk/mnist-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/hojjatk/mnist-dataset?dataset_version_number=1...


100%|██████████| 22.0M/22.0M [00:03<00:00, 7.21MB/s]


Extracting files...
Path to dataset files: /Users/Marius/.cache/kagglehub/datasets/hojjatk/mnist-dataset/versions/1


In [11]:

from pathlib import Path
import numpy as np
import struct
import torch
import torch.nn as nn
root = Path.home() / ".cache/kagglehub/datasets/hojjatk/mnist-dataset/versions/1"

def load_idx_images(path):
    with path.open("rb") as f:
        magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
        data = np.frombuffer(f.read(), dtype=np.uint8).reshape(num,
    rows, cols)
    return data

def load_idx_labels(path):
    with path.open("rb") as f:
        magic, num = struct.unpack(">II", f.read(8))
        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return labels

X_train = load_idx_images(root / "train-images.idx3-ubyte")
y_train = load_idx_labels(root / "train-labels.idx1-ubyte")
X_test  = load_idx_images(root / "t10k-images.idx3-ubyte")
y_test  = load_idx_labels(root / "t10k-labels.idx1-ubyte")

X_train = torch.tensor(X_train, dtype=torch.float32) / 255.0
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32) / 255.0
y_test = torch.tensor(y_test, dtype=torch.long)

print(X_train.shape, y_train.shape)


torch.Size([60000, 28, 28]) torch.Size([60000])


In [19]:
class MLPScratch(nn.Module):
    def __init__(self, input_size, num_outputs, num_hiddens,
                 lr=0.1, sigma=0.1, batch_size=256, num_epochs=10):
        super().__init__()
        self.input_size = input_size
        self.W1 = nn.Parameter(torch.randn(input_size, num_hiddens) * sigma)
        self.b1 = nn.Parameter(torch.zeros(num_hiddens))
        self.W2 = nn.Parameter(torch.randn(num_hiddens, 128) * sigma)
        self.b2 = nn.Parameter(torch.zeros(128))
        self.W3 = nn.Parameter(torch.randn(128, num_outputs) * sigma)
        self.b3 = nn.Parameter(torch.zeros(num_outputs))
        self.lr = lr
        self.batch_size = batch_size
        self.num_epochs = num_epochs
    
    def relu(self,x):
        a = torch.zeros_like(x)
        return torch.clamp(x,min=a)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        X = x.view(-1,self.input_size) ## flatten from (batch_size, 28, 28) to (batch_size, 784)
        h1 = self.relu(X @ self.W1 + self.b1)
        h = self.relu(h1 @ self.W2 + self.b2)
        return h @ self.W3 + self.b3

model = MLPScratch(input_size=28*28, num_outputs=10, num_hiddens=256)
loss_fn = nn.CrossEntropyLoss()
## optimizer takes parameters to optimize
optimizer = torch.optim.SGD([model.W1, model.b1, model.W2, model.b2, model.W3, model.b3], lr=model.lr)
num_batches = X_train.shape[0] // model.batch_size
for epoch in range(model.num_epochs):
    for i in range(num_batches):
        X_batch = X_train[i*model.batch_size:(i+1)*model.batch_size]
        y_batch = y_train[i*model.batch_size:(i+1)*model.batch_size]
        
        logits = model.forward(X_batch)
        loss = loss_fn(logits, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Evaluate on test set
with torch.no_grad():
    test_logits = model.forward(X_test)
    test_loss = loss_fn(test_logits, y_test)
    predictions = torch.argmax(test_logits, dim=1)
    accuracy = (predictions == y_test).float().mean()
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item()*100:.2f}%")



Epoch 1, Loss: 0.3141
Epoch 2, Loss: 0.2593
Epoch 3, Loss: 0.2277
Epoch 4, Loss: 0.2099
Epoch 5, Loss: 0.1961
Epoch 6, Loss: 0.1841
Epoch 7, Loss: 0.1735
Epoch 8, Loss: 0.1651
Epoch 9, Loss: 0.1565
Epoch 10, Loss: 0.1477
Test Loss: 0.1115, Test Accuracy: 96.40%


In [20]:
## With high level API
class MLPHighLevel(nn.Module):
    def __init__(self, input_size, num_outputs, num_hiddens):
        super().__init__()
        self.net = nn.Sequential(
            nn.Flatten(start_dim=1, end_dim=-1), # flatten input, from (batch_size, 28, 28) to (batch_size, 784)
            nn.Linear(input_size, num_hiddens), 
            nn.ReLU(),
            nn.Linear(num_hiddens, 128),
            nn.ReLU(),
            nn.Linear(128, num_outputs)
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

model_hl = MLPHighLevel(input_size=28*28, num_outputs=10, num_hiddens=256)
loss_fn_hl = nn.CrossEntropyLoss()
optimizer_hl = torch.optim.SGD(model_hl.parameters(), lr=0.1)
num_epochs_hl = 10
batch_size_hl = 256
num_batches_hl = X_train.shape[0] // batch_size_hl
for epoch in range(num_epochs_hl):
    for i in range(num_batches_hl):
        X_batch = X_train[i*batch_size_hl:(i+1)*batch_size_hl]
        y_batch = y_train[i*batch_size_hl:(i+1)*batch_size_hl]
        
        logits = model_hl.forward(X_batch)
        loss = loss_fn_hl(logits, y_batch)
        
        optimizer_hl.zero_grad()
        loss.backward()
        optimizer_hl.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# Evaluate on test set
with torch.no_grad():
    test_logits_hl = model_hl.forward(X_test)
    test_loss_hl = loss_fn_hl(test_logits_hl, y_test)
    predictions_hl = torch.argmax(test_logits_hl, dim=1)
    accuracy_hl = (predictions_hl == y_test).float().mean()
    print(f"Test Loss: {test_loss_hl.item():.4f}, Test Accuracy: {accuracy_hl.item()*100:.2f}%")

Epoch 1, Loss: 0.3585
Epoch 2, Loss: 0.3285
Epoch 3, Loss: 0.2978
Epoch 4, Loss: 0.2716
Epoch 5, Loss: 0.2481
Epoch 6, Loss: 0.2296
Epoch 7, Loss: 0.2147
Epoch 8, Loss: 0.2027
Epoch 9, Loss: 0.1910
Epoch 10, Loss: 0.1804
Test Loss: 0.1217, Test Accuracy: 96.46%


In [27]:
X_train = X_train.reshape(-1,1,28,28) # reshape to add channel dimension
# channel dimension is 1 for grayscale images, it's needed for Conv2d layers
# because Conv2d expects input shape (batch_size, channels, height, width)



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNScratch(nn.Module):
    def __init__(self, lr=0.1, sigma=0.1, batch_size=256, num_epochs=10):
        super().__init__()
        self.lr = lr
        self.batch_size = batch_size
        self.num_epochs = num_epochs

        # Conv1: in_channels=1, out_channels=16, kernel=3x3
        # basically 16 filters of size 1x3x3, each filter creates one output channel/activation map
        # which has to be equal to the number of input channels (1 for grayscale images)
        self.Wc1 = nn.Parameter(torch.randn(16, 1, 3, 3) * sigma)
        self.bc1 = nn.Parameter(torch.zeros(16))
        # Conv2: in_channels=16, out_channels=32, kernel=3x3
        # basically 32 filters of size 16x3x3, each filter creates one output channel/activation map
        # each of the 32 filters looks at all the 16 activation maps/channels from previous layer
        self.Wc2 = nn.Parameter(torch.randn(32, 16, 3, 3) * sigma)
        self.bc2 = nn.Parameter(torch.zeros(32))
        
        # With padding=1: 28->28 after each conv
        # After maxpool 2x2 (stride 2): 28->14
        flat = 32 * 14 * 14  # 32 channels, 14x14 feature map size

        # Final linear: flat -> 10
        self.W = nn.Parameter(torch.randn(flat, 10) * sigma)
        self.b = nn.Parameter(torch.zeros(10))

    def relu(self, x):
        return torch.clamp(x, min=0)

    def forward(self, x):
        # x should be (B, 1, 28, 28)
        h = F.conv2d(x, self.Wc1, self.bc1, stride=1, padding=1)     # (B,16,28,28)
        h = self.relu(h)
        h = F.conv2d(h, self.Wc2, self.bc2, stride=1, padding=1)     # (B,32,28,28)
        h = self.relu(h)
        h = F.max_pool2d(h, kernel_size=2, stride=2)  # (B,32,14,14), scales down by factor 2
        h = h.view(h.shape[0], -1)             # (B, 32*14*14), flatten
        logits = h @ self.W + self.b           # (B,10)
        return logits


In [46]:

model_cnn = CNNScratch()
logits = model_cnn.forward(X_train[:4])  # pass a small batch to see shapes
print("logits", logits.shape)


logits torch.Size([4, 10])


In [47]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_cnn.parameters(), lr=model_cnn.lr)
num_batches = X_train.shape[0] // model_cnn.batch_size
for epoch in range(model_cnn.num_epochs):
    for i in range(num_batches):
        X_batch = X_train[i*model_cnn.batch_size:(i+1)*model_cnn.batch_size]
        y_batch = y_train[i*model_cnn.batch_size:(i+1)*model_cnn.batch_size]
        
        logits = model_cnn.forward(X_batch)
        loss = loss_fn(logits, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
    

Epoch 1, Loss: 0.3376
Epoch 2, Loss: 0.2997
Epoch 3, Loss: 0.2776
Epoch 4, Loss: 0.2524
Epoch 5, Loss: 0.2338
Epoch 6, Loss: 0.2154
Epoch 7, Loss: 0.2025
Epoch 8, Loss: 0.1892
Epoch 9, Loss: 0.1768
Epoch 10, Loss: 0.1657


In [48]:
# Evaluate on test set
with torch.no_grad():
    X_test = X_test.reshape(-1,1,28,28)  # reshape test set
    test_logits = model_cnn.forward(X_test)
    test_loss = loss_fn(test_logits, y_test)
    predictions = torch.argmax(test_logits, dim=1)
    accuracy = (predictions == y_test).float().mean()
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item()*100:.2f}%")


Test Loss: 0.0582, Test Accuracy: 98.09%


In [51]:
X_train

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0.

In [87]:
class CNNHighLevel(nn.Module):
    def __init__(self, input_size, num_outputs):
        super().__init__()
        self.net = nn.Sequential(
            # First convolutional layer, creates 16 output channels/activation maps with 3x3 filters
            nn.Conv2d(1,16,3, padding=1),
            nn.ReLU(),
            # Second convolutional layer, creates 32 output channels/activation maps with 3x3 filters based on the 16 from previous layer
            nn.Conv2d(16,32,3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32,64,3, padding=1), 
            nn.ReLU(),
            nn.MaxPool2d(2), # scales down by factor 2 so from 28x28 to 14x14
            nn.Flatten(), # 14x14x64 = 12544
            nn.Linear(12544, num_outputs),
            

        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

In [88]:
X_train  = X_train.reshape(-1,1,28,28)  # reshape train set
model_cnn_high = CNNHighLevel(input_size=28*28, num_outputs=10)
logits = model_cnn_high.forward(X_train[:4])  # pass a small batch to see shapes
print("logits", logits.shape)

logits torch.Size([4, 10])


In [89]:
#TRAINING BATCHES FOR CNN HIGH LEVEL
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_cnn_high.parameters(), lr=0.1)
num_epochs = 10
batch_size = 256
num_batches = X_train.shape[0] // batch_size
for epoch in range(num_epochs):
    for i in range(num_batches):
        X_batch = X_train[i*batch_size:(i+1)*batch_size]
        y_batch = y_train[i*batch_size:(i+1)*batch_size]
        
        logits = model_cnn_high.forward(X_batch)
        loss = loss_fn(logits, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


KeyboardInterrupt: 

In [64]:
#EVALUATION FOR CNN HIGH LEVEL
with torch.no_grad():
    X_test = X_test.reshape(-1,1,28,28)  # reshape test set
    test_logits = model_cnn_high.forward(X_test)
    test_loss = loss_fn(test_logits, y_test)
    predictions = torch.argmax(test_logits, dim=1)
    accuracy = (predictions == y_test).float().mean()
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item()*100:.2f}%")



Test Loss: 0.0530, Test Accuracy: 98.32%


In [90]:
### Fashion MNIST with CNN High Level API
# Load Fashion-MNIST from local archive folder
archive_root = Path.cwd() / "archive"  # Path from working directory to archive

FX_train = load_idx_images(archive_root / "train-images-idx3-ubyte")
Fy_train = load_idx_labels(archive_root / "train-labels-idx1-ubyte")
FX_test  = load_idx_images(archive_root / "t10k-images-idx3-ubyte")
Fy_test  = load_idx_labels(archive_root / "t10k-labels-idx1-ubyte")

# Convert to tensors and normalize
FX_train = torch.tensor(FX_train, dtype=torch.float32) / 255.0
Fy_train = torch.tensor(Fy_train, dtype=torch.long)
FX_test  = torch.tensor(FX_test, dtype=torch.float32) / 255.0
Fy_test  = torch.tensor(Fy_test, dtype=torch.long)

# Add channel dimension for CNNs
FX_train = FX_train.reshape(-1, 1, 28, 28)
FX_test  = FX_test.reshape(-1, 1, 28, 28)

print(FX_train.shape, Fy_train.shape)


torch.Size([60000, 1, 28, 28]) torch.Size([60000])


In [93]:
model_cnn = CNNHighLevel(input_size=28*28, num_outputs=10)

#training loop same as before, just using FX_train and Fy_train instead of X_train and y_train
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=0.1)
batch_size = 256
num_batches = FX_train.shape[0] // batch_size
num_epochs = 15
for epoch in range(num_epochs):
    for i in range(num_batches):
        X_batch = FX_train[i*batch_size:(i+1)*batch_size]
        y_batch = Fy_train[i*batch_size:(i+1)*batch_size]
        
        logits = model_cnn.forward(X_batch)
        loss = loss_fn(logits, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 2.3095
Epoch 2, Loss: 2.3090
Epoch 3, Loss: 2.3095
Epoch 4, Loss: 2.3102
Epoch 5, Loss: 2.3108
Epoch 6, Loss: 2.3112
Epoch 7, Loss: 2.3115
Epoch 8, Loss: 2.3115
Epoch 9, Loss: 2.3116
Epoch 10, Loss: 2.3116
Epoch 11, Loss: 2.3116
Epoch 12, Loss: 2.3116
Epoch 13, Loss: 2.3115
Epoch 14, Loss: 2.3115
Epoch 15, Loss: 2.3115


#### LOG
With 2 conv layers:
conv1 16 filters 3x3
conv2 32 filters 3x3
Before padding: 86.67%

After: 87.90%

With 3 conv layers
conv1 16 filters 3x3
conv2 32 filters 3x3
conv3 64 filters 3x3
88,07%

In [92]:
# Evaluate on test set
with torch.no_grad():
    test_logits = model_cnn.forward(FX_test)
    test_loss = loss_fn(test_logits, Fy_test)
    predictions = torch.argmax(test_logits, dim=1)
    accuracy = (predictions == Fy_test).float().mean()
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item()*100:.2f}%")
    

Test Loss: 0.3377, Test Accuracy: 88.07%


tensor([[-2.9932, -5.1288, -2.9457,  ...,  5.4631,  2.2260,  9.1247],
        [ 3.6042, -5.2341, 12.4575,  ..., -9.7423, -2.5794, -4.3412],
        [-0.2589, 11.5586, -4.1436,  ..., -7.3697, -3.9229, -7.0524],
        ...,
        [ 0.5151, -6.1618,  0.0427,  ..., -3.2563,  5.8552, -8.8672],
        [-3.4202,  9.3497, -5.1509,  ..., -4.6715, -3.6099, -2.7315],
        [-2.1577, -3.2416,  0.2203,  ...,  3.2738,  1.9366, -0.6573]])