# In [None]:  (notebook cell marker left over from the export)
# ----------------------------------------------------
# Fashion MNIST Multi-Layer Neural Network from Scratch
# ----------------------------------------------------
# This script:
# 1. Loads the Fashion MNIST dataset from CSV
# 2. Splits into train/test sets per class
# 3. Preprocesses and normalizes data
# 4. Defines ReLU, Sigmoid, Softmax activations
# 5. Builds and trains a neural network with L2 regularization and label smoothing
# 6. Tracks and prints accuracy and loss across epochs
# ----------------------------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# ----------------------------------------------------
# Load the CSV containing Fashion MNIST data
# ----------------------------------------------------
# Read the whole training CSV; the "label" column identifies each row's class.
df = pd.read_csv("/kaggle/input/fashion-mnist-train-csv/fashion-mnist_train.csv")

# Splitting inside every label group gives each class the same 80% / 20%
# train/test ratio (a manual form of stratification).
grouped = df.groupby("label")
per_class_splits = [
    train_test_split(
        class_rows,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=None,
    )
    for _, class_rows in grouped
]
train_list = [train_part for train_part, _ in per_class_splits]
test_list = [test_part for _, test_part in per_class_splits]

# Stack the per-class pieces, then shuffle the rows (fixed seed) so the
# classes are interleaved, and renumber the index from 0.
train_df = pd.concat(train_list)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_list)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# ----------------------------------------------------
# Visualize one example per class
# ----------------------------------------------------
# One representative row per label, with "label" restored as a regular column.
examples = train_df.groupby("label").first().reset_index()

plt.figure(figsize=(10, 4))
for i in range(10):
    # Everything except the label is pixel data; reshape it into a 28 x 28 image.
    row = examples.loc[i]
    pixels = row.drop("label").values
    img = pixels.astype(np.uint8).reshape(28, 28)

    ax = plt.subplot(2, 5, i + 1)  # 2 x 5 grid; subplot positions are 1-based
    plt.imshow(img, cmap="gray")
    plt.title(f"Label: {examples.loc[i, 'label']}")
    plt.axis("off")

plt.tight_layout()
plt.show()

# ----------------------------------------------------
# Prepare Train/Test Feature Matrices and Labels
# ----------------------------------------------------
# Training features as float32 and training labels as integer class ids.
X = train_df.drop("label", axis=1).values.astype(np.float32)
y = train_df["label"].values
num_classes = np.max(y) + 1  # labels are used as 0-based row indices below
y = np.eye(num_classes)[y]  # One-hot encoding: pick row `label` of the identity

# Same layout for the held-out split, encoded with the training class count.
X_test = test_df.drop("label", axis=1).values.astype(np.float32)
y_test = test_df["label"].values
y_test = np.eye(num_classes)[y_test]

# Seed NumPy's global RNG so the random weight initialisation further down
# is reproducible.
np.random.seed(0)

# Normalize data: standardise every feature column using statistics from the
# training split only, and reuse those statistics for the test split.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
# A column that is constant over the training split has std == 0; dividing
# by it would yield NaN/inf and contaminate every later matrix product.
# Using 1 instead maps such a column to exactly 0 in the training data.
std[std == 0] = 1.0
X = (X - mean) / std
X_test = (X_test - mean) / std

print("Shapes:", X.shape, y.shape, X_test.shape, y_test.shape)

# ----------------------------------------------------
# Activation Functions
# ----------------------------------------------------
def relu(x, grad):
    """Rectified linear unit, or its derivative.

    Args:
        x: Array of pre-activations.
        grad: When truthy, return the derivative instead of the activation.

    Returns:
        ``max(0, x)`` element-wise, or -- when ``grad`` is truthy -- a float
        array that is 1.0 where ``x > 0`` and 0.0 elsewhere.
    """
    return (x > 0).astype(float) if grad else np.maximum(0, x)

def sigmoid(x, grad):
    """Logistic sigmoid, or its derivative, evaluated without overflow.

    Args:
        x: Array (or scalar) of pre-activations.
        grad: When truthy, return the derivative ``s * (1 - s)`` instead of
            the activation ``s``.

    Returns:
        ``1 / (1 + exp(-x))`` element-wise, or its derivative.
    """
    # log(1 + exp(-x)) == logaddexp(0, -x). NumPy evaluates it without ever
    # forming exp(-x) directly, so very negative inputs no longer overflow
    # (the naive formula emits a RuntimeWarning there).
    s = np.exp(-np.logaddexp(0, -x))
    if grad:
        return s * (1 - s)
    return s

def softmax(z, grad):
    """Row-wise softmax of a 2-D score matrix.

    Args:
        z: Scores, one row per sample (normalisation runs along axis 1).
        grad: Accepted so the signature matches the other activations, but
            ignored: the forward value is returned either way.

    Returns:
        Array with the same shape as ``z`` whose rows sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # the exponentials from overflowing.
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    return exps / exps.sum(axis=1, keepdims=True)

# ----------------------------------------------------
# Network Architecture and Initialization
# ----------------------------------------------------
# Optimisation hyperparameters.
alpha = 0.002  # learning rate
batch = 32     # mini-batch size
lmbda = 0.1    # L2 regularisation strength

# Layer widths (last entry is the output layer) and the activation applied
# after each layer, position by position.
arch = [128, 128, 10]
activations = [relu, relu, softmax]

# Per-layer weight matrices and bias rows, filled in by the initialisation
# loop below.
W, B = [], []

# Highest mean test accuracy seen so far, used to decide when to checkpoint.
best_test = 0

def label_smoothing(y_true, epsilon=0.1):
    """Blend one-hot targets with a uniform distribution over the classes.

    Args:
        y_true: 2-D target array with one column per class.
        epsilon: Fraction of probability mass spread evenly over all classes.

    Returns:
        ``(1 - epsilon) * y_true + epsilon / K`` where ``K`` is the number of
        columns of ``y_true``.
    """
    n_classes = y_true.shape[1]
    uniform_share = epsilon / n_classes
    return y_true * (1 - epsilon) + uniform_share

# He initialization
# Walk consecutive (inputs, outputs) size pairs, starting from the number of
# input features. Weights are standard-normal draws scaled by
# sqrt(2 / fan_in); biases start at zero.
layer_sizes = [X.shape[1]] + arch
for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
    W.append(np.random.randn(fan_in, fan_out) * np.sqrt(2. / fan_in))
    B.append(np.zeros((1, fan_out)))

# ----------------------------------------------------
# Training Loop
# ----------------------------------------------------
# Per-epoch history of the averaged train/test accuracy and cost.
whole_accuracy = []
whole_accuracy_test = []
whole_cost = []
whole_cost_test = []

for e in range(600):
    all_accuracy = []
    all_cost = []

    # Training batches (the last one may be shorter than `batch`)
    for i in range(int(np.ceil(len(X) / batch))):
        X_batch = X[batch * i : batch * (i + 1)]
        y_batch = y[batch * i : batch * (i + 1)]
        m_batch = X_batch.shape[0]
        y_smooth = label_smoothing(y_batch, epsilon=0.1)

        # Forward pass: keep every pre-activation Z and activation A,
        # because backpropagation needs them.
        A = X_batch
        all_A = []
        all_Z = []
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)
            all_Z.append(Z)
            all_A.append(A)

        # Cost: cross-entropy against the smoothed targets plus L2 penalty.
        # The 1e-8 keeps log() finite when a predicted probability is 0.
        cost = (-1 / m_batch) * np.sum(y_smooth * np.log(A + 1e-8))
        cost += (lmbda / (2 * m_batch)) * sum(np.sum(w ** 2) for w in W)
        all_cost.append(cost)

        # Accuracy (smoothing does not move the arg-max of the targets)
        y_pred = np.argmax(A, axis=1)
        y_true = np.argmax(y_smooth, axis=1)
        accuracy = np.mean(y_pred == y_true) * 100
        all_accuracy.append(accuracy)

        # Backpropagation.
        # All gradients are computed first and applied afterwards: the
        # delta for layer j is propagated through W[j + 1], which must
        # still hold the weights used in the forward pass. Updating in
        # place inside this loop would propagate through already-modified
        # weights.
        grad_W = [None] * len(W)
        grad_B = [None] * len(W)
        for j in reversed(range(len(W))):
            if j == len(W) - 1:
                # Softmax + cross-entropy: output delta is prediction - target.
                dz = all_A[j] - y_smooth
            else:
                dz = (dz @ W[j + 1].T) * activations[j](all_Z[j], grad=True)
            layer_input = X_batch if j == 0 else all_A[j - 1]
            dw = layer_input.T @ dz
            grad_W[j] = dw + lmbda * W[j]  # data gradient + L2 term
            grad_B[j] = np.sum(dz, axis=0, keepdims=True)

        # Gradient-descent step, averaged over the batch.
        for j in range(len(W)):
            W[j] -= (alpha / m_batch) * grad_W[j]
            B[j] -= (alpha / m_batch) * grad_B[j]

    # Evaluation on test set
    all_accuracy_test = []
    all_cost_test = []
    # The weights do not change during evaluation, so the L2 sum is
    # computed once per epoch instead of once per batch.
    l2_sum = sum(np.sum(w ** 2) for w in W)
    for k in range(int(np.ceil(len(X_test) / batch))):
        X_test_batch = X_test[batch * k : batch * (k + 1)]
        y_test_batch = y_test[batch * k : batch * (k + 1)]
        m_test = X_test_batch.shape[0]

        # Forward pass only
        A = X_test_batch
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)

        # Test cost uses the hard one-hot targets (no label smoothing).
        cost_test = (-1 / m_test) * np.sum(y_test_batch * np.log(A + 1e-8))
        cost_test += (lmbda / (2 * m_test)) * l2_sum

        y_pred_test = np.argmax(A, axis=1)
        y_true_test = np.argmax(y_test_batch, axis=1)
        accuracy_test = np.mean(y_pred_test == y_true_test) * 100

        all_accuracy_test.append(accuracy_test)
        all_cost_test.append(cost_test)

    # Record stats
    # NOTE: per-batch values are averaged unweighted, so a shorter final
    # batch counts as much as a full one.
    mean_acc = np.mean(all_accuracy)
    mean_acc_test = np.mean(all_accuracy_test)
    mean_cost = np.mean(all_cost)
    mean_cost_test = np.mean(all_cost_test)

    whole_accuracy.append(mean_acc)
    whole_accuracy_test.append(mean_acc_test)
    whole_cost.append(mean_cost)
    whole_cost_test.append(mean_cost_test)

    # Save best model: overwrite the checkpoint whenever the mean test
    # accuracy strictly improves.
    if best_test < mean_acc_test:
        best_test = mean_acc_test
        for k in range(len(W)):
            np.save(f"/kaggle/working/W{k}.npy", W[k])
            np.save(f"/kaggle/working/B{k}.npy", B[k])

    # Print progress
    print(
        f"Epoch {e} | Best Test Acc: {best_test:.2f}% "
        f"| Train Acc: {mean_acc:.2f}% | Test Acc: {mean_acc_test:.2f}% "
        f"| Train Cost: {mean_cost:.4f} | Test Cost: {mean_cost_test:.4f}"
    )

