Basic CNN Implementation on Fashion MNIST

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np

In [4]:
# Preprocessing pipeline applied to every Fashion-MNIST image (grayscale, 1 channel):
#   1. ToTensor  -- converts the image to a tensor, rescaling [0,255] -> [0,1]
#   2. Normalize -- (x - 0.5) / 0.5 on the single channel, mapping [0,1] -> [-1,1]
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=(0.5,), std=(0.5,))]
)

In [5]:
# Seed for the train/validation partition so that re-running the notebook
# reproduces the same split (and therefore comparable validation metrics).
SPLIT_SEED = 42

# Training portion of Fashion-MNIST; downloaded into ./data if not already present.
full_train_dataset = datasets.FashionMNIST(
    root="data", train=True, download=True, transform=transform
)

# Held-out test portion; only used for the final evaluation.
test_dataset = datasets.FashionMNIST(
    root="data", train=False, download=True, transform=transform
)

# 90% of the training portion is used for fitting, the remainder for validation.
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size

# A dedicated seeded generator makes the split deterministic without
# touching the global torch RNG state.
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader shuffles; evaluation order does not matter,
# so the validation/test loaders use larger, unshuffled batches.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=256, shuffle=False)

100.0%
100.0%
100.0%
100.0%


In [6]:
def create_cnn_layers(in_channels=1, num_classes=10):
    """Build the layers of the three-stage CNN used by ``forward_pass``.

    The layers are returned in an ``nn.ModuleDict`` rather than a plain dict so
    that they are registered as sub-modules: the whole network can then be
    handled as one unit (``.parameters()``, ``.state_dict()``, ``.to(device)``,
    ``.train()`` / ``.eval()``) while still supporting ``layers["conv1"]``
    lookups and ``layers.values()`` iteration.

    Args:
        in_channels: Number of channels in the input images (default 1,
            i.e. grayscale).
        num_classes: Number of output logits (default 10).

    Returns:
        nn.ModuleDict with keys ``conv1``, ``conv2``, ``conv3``, ``fc1``,
        ``fc2`` and ``pool``, inserted in that order.
    """
    layers = nn.ModuleDict()

    # 3x3 convolutions with padding=1 keep the spatial size; only the shared
    # 2x2 max-pool halves it after each stage.
    layers["conv1"] = nn.Conv2d(in_channels, 32, 3, padding=1)
    layers["conv2"] = nn.Conv2d(32, 64, 3, padding=1)
    layers["conv3"] = nn.Conv2d(64, 128, 3, padding=1)

    # 128 * 3 * 3 matches 28x28 inputs after three poolings (28 -> 14 -> 7 -> 3).
    layers["fc1"] = nn.Linear(128 * 3 * 3, 256)
    layers["fc2"] = nn.Linear(256, num_classes)

    # Parameter-free, so one instance is reused after every conv stage.
    layers["pool"] = nn.MaxPool2d(2, 2)
    return layers

def forward_pass(x, layers):
    """Compute class logits for a batch by running it through the CNN layers.

    Args:
        x: Input batch; it is fed straight into ``layers["conv1"]``.
        layers: Mapping providing ``conv1``, ``conv2``, ``conv3``, ``fc1``,
            ``fc2`` and ``pool``.

    Returns:
        The raw (un-normalised) output of ``layers["fc2"]``.
    """
    downsample = layers["pool"]

    # Three identical stages: convolution -> ReLU -> max-pool.
    features = x
    for stage in ("conv1", "conv2", "conv3"):
        features = downsample(F.relu(layers[stage](features)))

    # Collapse everything except the batch dimension for the dense head.
    flat = features.view(features.size(0), -1)
    hidden = F.relu(layers["fc1"](flat))
    return layers["fc2"](hidden)

In [7]:
def train_one_epoch(layers, loader, optimizer, loss_fn):
    """Run one optimisation pass over ``loader`` and return the mean per-sample loss.

    Args:
        layers: Mapping of layer name -> module, as consumed by ``forward_pass``.
        loader: Iterable yielding ``(images, labels)`` batches.
        optimizer: Optimizer holding the parameters of ``layers``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
            The per-sample average assumes it returns the batch mean (the
            default reduction of ``nn.CrossEntropyLoss``).

    Returns:
        float: Loss averaged over the samples actually processed this epoch.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    total_loss = 0.0
    samples_seen = 0

    for images, labels in loader:
        batch_size = images.size(0)

        logits = forward_pass(images, layers)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Re-weight the batch mean by batch size so a smaller final batch
        # does not skew the epoch average.
        total_loss += loss.item() * batch_size
        samples_seen += batch_size

    if samples_seen == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")

    # Divide by what was actually iterated rather than len(loader.dataset):
    # the two differ with drop_last=True or a sampler that visits a subset.
    return total_loss / samples_seen

In [8]:
def evaluate_model(layers, loader):
    """Score the network on every batch of ``loader``.

    Args:
        layers: Mapping of layer name -> module, as consumed by ``forward_pass``.
        loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)`` where
        precision, recall and F1 are macro-averaged.
    """
    targets = []
    guesses = []

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            # Predicted class = index of the largest logit.
            batch_guesses = forward_pass(batch_images, layers).argmax(dim=1)

            targets.extend(batch_labels.numpy())
            guesses.extend(batch_guesses.numpy())

    accuracy = accuracy_score(targets, guesses)
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        targets, guesses, average="macro"
    )
    conf_matrix = confusion_matrix(targets, guesses)

    return accuracy, precision, recall, f1_score, conf_matrix

In [9]:
if __name__ == "__main__":

    # Seed torch's global RNG before any weights are created so that the
    # initialisation and the shuffled batch order are repeatable across runs.
    torch.manual_seed(42)

    layers = create_cnn_layers()

    # Collect the parameters of every layer; parameter-free layers
    # (the max-pool) simply contribute nothing.
    optimizer = torch.optim.Adam(
        [p for layer in layers.values() for p in layer.parameters()],
        lr=1e-3
    )

    loss_fn = nn.CrossEntropyLoss()
    epochs = 15

    for epoch in range(epochs):
        train_loss = train_one_epoch(
            layers, train_loader, optimizer, loss_fn
        )

        # Validation metrics after each epoch to monitor generalisation;
        # the confusion matrix is not needed here.
        val_acc, val_p, val_r, val_f1, _ = evaluate_model(
            layers, val_loader
        )

        print(
            f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val Acc: {val_acc:.4f} | "
            f"Val F1: {val_f1:.4f}"
        )

    # Final evaluation on the held-out test set, using the weights
    # from the last epoch.
    test_acc, test_p, test_r, test_f1, test_cm = evaluate_model(
        layers, test_loader
    )

    print("\nTest Results:")
    print("Accuracy :", test_acc)
    print("Precision:", test_p)
    print("Recall   :", test_r)
    print("F1-score :", test_f1)
    print("Confusion Matrix:\n", test_cm)


Epoch 1/15 | Train Loss: 0.5314 | Val Acc: 0.8748 | Val F1: 0.8774
Epoch 2/15 | Train Loss: 0.3051 | Val Acc: 0.8968 | Val F1: 0.8957
Epoch 3/15 | Train Loss: 0.2559 | Val Acc: 0.9017 | Val F1: 0.9013
Epoch 4/15 | Train Loss: 0.2277 | Val Acc: 0.9130 | Val F1: 0.9125
Epoch 5/15 | Train Loss: 0.2027 | Val Acc: 0.9122 | Val F1: 0.9119
Epoch 6/15 | Train Loss: 0.1795 | Val Acc: 0.9223 | Val F1: 0.9223
Epoch 7/15 | Train Loss: 0.1608 | Val Acc: 0.9163 | Val F1: 0.9166
Epoch 8/15 | Train Loss: 0.1466 | Val Acc: 0.9195 | Val F1: 0.9194
Epoch 9/15 | Train Loss: 0.1292 | Val Acc: 0.9152 | Val F1: 0.9150
Epoch 10/15 | Train Loss: 0.1164 | Val Acc: 0.9198 | Val F1: 0.9203
Epoch 11/15 | Train Loss: 0.1012 | Val Acc: 0.9247 | Val F1: 0.9244
Epoch 12/15 | Train Loss: 0.0873 | Val Acc: 0.9242 | Val F1: 0.9231
Epoch 13/15 | Train Loss: 0.0755 | Val Acc: 0.9233 | Val F1: 0.9220
Epoch 14/15 | Train Loss: 0.0662 | Val Acc: 0.9237 | Val F1: 0.9221
Epoch 15/15 | Train Loss: 0.0588 | Val Acc: 0.9187 | Val 

Methods  
A custom convolutional neural network with three convolutional layers was implemented from scratch using PyTorch for Fashion-MNIST image classification. The dataset was split into training, validation, and test sets. The model was trained using the Adam optimizer and cross-entropy loss. Performance was evaluated using accuracy, precision, recall, F1-score, and a confusion matrix.  
  
Results    
Macro averaging computes precision, recall, and F1-score independently for each class and then averages them equally across classes. This approach ensures that performance on all classes contributes equally to the final metric, preventing dominant or easy classes from masking poor performance on harder classes. For Fashion-MNIST, where class frequencies are balanced but visual difficulty varies, macro-averaged metrics provide a fair and informative evaluation of model performance.  
  
The CNN achieved a test accuracy of 90.9% and a macro-averaged F1-score of 0.91. Strong performance was observed across most classes, with confusion primarily occurring between visually similar clothing categories such as T-shirts, pullovers, and shirts. Validation performance peaked around epoch 11–12, indicating optimal convergence without significant overfitting.