# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import cv2
import random

# -------------------------------
# Read the Fashion MNIST training CSV
# -------------------------------
df = pd.read_csv("/kaggle/input/fashion-mnist-train-csv/fashion-mnist_train.csv")

# One sub-frame per value of the "label" column
grouped = df.groupby("label")

# Per-class train / test pieces, filled by the loop below
train_list = []
test_list = []

# -------------------------------
# Hold out 20% of every class for testing (80% stays in train)
# -------------------------------
for class_label, class_rows in grouped:
    # Each class is split on its own, so no stratify argument is needed.
    pieces = train_test_split(
        class_rows,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=None,
    )
    train_list.append(pieces[0])
    test_list.append(pieces[1])

# Stitch the per-class pieces together, then shuffle the rows so classes
# are interleaved and renumber the index from 0.
train_df = pd.concat(train_list)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_list)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# -------------------------------
# Show the first training row of each class
# -------------------------------
examples = train_df.groupby("label").first().reset_index()

plt.figure(figsize=(10, 4))
for i in range(10):
    row = examples.loc[i]
    # 2 x 5 grid, one panel per class
    ax = plt.subplot(2, 5, i + 1)
    pixels = row.drop("label").values
    img = pixels.astype(np.uint8).reshape(28, 28)
    ax.imshow(img, cmap="gray")
    ax.set_title(f"Label: {row['label']}")
    ax.axis("off")
plt.tight_layout()
plt.show()

# -------------------------------
# Prepare training and testing data
# -------------------------------
# Features are every column except "label", as float32 rows.
X = train_df.drop("label", axis=1).values.astype(np.float32)
y = train_df["label"].values
# Number of classes is taken from the largest label in the training split.
num_classes = np.max(y) + 1
y = np.eye(num_classes)[y]  # One-hot encode labels

X_test = test_df.drop("label", axis=1).values.astype(np.float32)
y_test = test_df["label"].values
y_test = np.eye(num_classes)[y_test]

# -------------------------------
# Normalize inputs
# -------------------------------
# Seed NumPy's global RNG so later np.random draws are repeatable.
np.random.seed(0)
# Per-column statistics come from the training split only and are reused
# for the test split.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
# A column that is constant across the training split has std == 0;
# dividing by it would produce NaN/inf. Use 1 instead so such a column
# simply becomes 0 after centering.
std[std == 0] = 1.0
X = (X - mean) / std
X_test = (X_test - mean) / std

print(X.shape, y.shape, X_test.shape, y_test.shape)

# -------------------------------
# Activation functions
# -------------------------------
def relu(x, grad):
    """Rectified linear unit, or its derivative.

    Args:
        x: NumPy array of pre-activations.
        grad: When true, return the derivative instead of the activation.

    Returns:
        ``max(0, x)`` element-wise when ``grad`` is false; otherwise a float
        array holding 1.0 where ``x > 0`` and 0.0 elsewhere.
    """
    if not grad:
        return np.maximum(0, x)
    positive = x > 0
    return positive.astype(float)

def sigmoid(x, grad):
    """Logistic sigmoid, or its derivative, evaluated without overflow.

    Args:
        x: Scalar or NumPy array of pre-activations.
        grad: When true, return the derivative ``s * (1 - s)`` instead of
            the activation ``s``.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # Only exponentiate non-positive values: exp(-|x|) lies in (0, 1], so it
    # can never overflow, unlike exp(-x) for large negative x.
    e = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + exp(-x)).  For x < 0: exp(x) / (1 + exp(x)).
    s = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    if grad:
        return s * (1 - s)
    return s

def softmax(z, grad):
    """Row-wise softmax of a 2-D array of logits.

    Args:
        z: Array whose axis 1 holds the logits of one sample.
        grad: Accepted so the signature matches the other activations; it is
            not used, and the softmax itself is returned either way.

    Returns:
        Array of the same shape as ``z`` whose rows sum to 1.
    """
    # Subtract each row's maximum before exponentiating to keep exp() bounded.
    row_max = np.max(z, axis=1, keepdims=True)
    exps = np.exp(z - row_max)
    row_total = np.sum(exps, axis=1, keepdims=True)
    return exps / row_total

# -------------------------------
# Network architecture
# -------------------------------
# Layer widths: two hidden layers of 300 units, then a 10-unit output layer
arch = [300, 300, 10]
# One activation per entry of arch
activations = [relu, relu, softmax]

# Per-layer parameters, filled in by the initialization step
W, B = [], []   # weights, biases

# Hyperparameters
alpha = 0.002      # learning rate
lmbda = 0.001      # L2 regularization factor
batch = 32         # mini-batch size
best_test = 0      # best test accuracy seen so far

# -------------------------------
# Data augmentation function
# -------------------------------
def augmentation(img):
    """Randomly flip and/or rotate a single 2-D image.

    Each transform is applied independently with probability 0.5. All random
    draws come from NumPy's global RNG, so seeding it with ``np.random.seed``
    makes the augmentation repeatable.

    Args:
        img: 2-D image array (rows x columns).

    Returns:
        The augmented image as a NumPy array with the same shape as ``img``.
    """
    # Random horizontal flip
    if np.random.rand() < 0.5:
        img = np.fliplr(img)
    # Random rotation by a whole number of degrees in [-15, +15]
    if np.random.rand() < 0.5:
        # randint's upper bound is exclusive, hence 16 to include +15.
        angle = float(np.random.randint(-15, 16))
        height, width = img.shape[:2]
        # Pixel centers sit on integer coordinates, so the image center is
        # at (size - 1) / 2 along each axis.
        center = ((width - 1) / 2.0, (height - 1) / 2.0)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        img = cv2.warpAffine(
            img, M, (width, height), borderMode=cv2.BORDER_REFLECT
        )
    return np.asarray(img)

# -------------------------------
# Label smoothing function
# -------------------------------
def label_smoothing(y_true, epsilon=0.1):
    """Blend one-hot targets with a uniform distribution over the classes.

    Args:
        y_true: 2-D array of targets, one row per sample and one column per
            class.
        epsilon: Share of probability mass spread evenly across all classes.

    Returns:
        ``(1 - epsilon) * y_true + epsilon / K`` where ``K`` is the number of
        columns of ``y_true``.
    """
    n_classes = y_true.shape[1]
    uniform_share = epsilon / n_classes
    kept = (1 - epsilon) * y_true
    return kept + uniform_share

# -------------------------------
# He initialization of weights
# -------------------------------
# Walk the layers in order, tracking the width feeding into each one; the
# first layer is fed by the input features.
fan_in = X.shape[1]
for fan_out in arch:
    # Standard-normal draws scaled by sqrt(2 / fan_in); biases start at zero.
    W.append(np.random.randn(fan_in, fan_out) * np.sqrt(2. / fan_in))
    B.append(np.zeros((1, fan_out)))
    fan_in = fan_out

# -------------------------------
# Training loop over epochs
# -------------------------------
# Per-epoch history: mean of the per-batch accuracy (%) and cost values.
whole_accuracy = []
whole_accuracy_test = []
whole_cost = []
whole_cost_test = []

for e in range(600):
    # Per-batch statistics collected during this epoch
    all_accuracy = []
    all_cost = []
    all_accuracy_test = []
    all_cost_test = []

    # -------------------------------
    # Mini-batch training
    # -------------------------------
    for i in range(int(np.ceil(len(X) / batch))):
        X_batch = X[batch * i : batch * (i + 1)]
        y_batch = y[batch * i : batch * (i + 1)]
        m_batch = X_batch.shape[0]

        # Apply label smoothing
        y_smooth = label_smoothing(y_batch, epsilon=0.1)

        # Apply augmentation image by image, then flatten back to rows
        X_image = X_batch.reshape(-1, 28, 28)
        X_aug = np.array([augmentation(img) for img in X_image])
        X_aug_flat = X_aug.reshape(-1, 784)

        # Forward pass; keep every layer's pre-activation (Z) and
        # activation (A) because backpropagation needs them.
        A = X_aug_flat
        all_A = []
        all_Z = []
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)
            all_A.append(A)
            all_Z.append(Z)

        # Cross-entropy against the smoothed targets plus L2 penalty;
        # 1e-8 keeps log() away from zero.
        cost = (-1 / m_batch) * np.sum(y_smooth * np.log(A + 1e-8))
        cost += (lmbda / (2 * m_batch)) * sum(np.sum(w ** 2) for w in W)
        all_cost.append(cost)

        # Compute training accuracy
        y_pred = np.argmax(A, axis=1)
        y_true = np.argmax(y_smooth, axis=1)
        accuracy = np.mean(y_pred == y_true) * 100
        all_accuracy.append(accuracy)

        # Backpropagation, phase 1: compute the gradient of every layer
        # while all weights still hold the values used in the forward pass.
        # Updating W[j + 1] before propagating the error through it would
        # make the gradient of layer j inconsistent with the forward pass.
        grads_W = [None] * len(W)
        grads_B = [None] * len(W)
        for j in reversed(range(len(W))):
            if j == len(W) - 1:
                # Softmax + cross-entropy: output error is prediction - target
                dz = all_A[j] - y_smooth
            else:
                dz = (dz @ W[j + 1].T) * activations[j](all_Z[j], grad=True)

            layer_input = X_aug_flat if j == 0 else all_A[j - 1]
            grads_W[j] = layer_input.T @ dz + lmbda * W[j]
            grads_B[j] = np.sum(dz, axis=0, keepdims=True)

        # Backpropagation, phase 2: apply all updates together.
        step = alpha / m_batch
        for j in range(len(W)):
            W[j] -= step * grads_W[j]
            B[j] -= step * grads_B[j]

    # -------------------------------
    # Evaluate on test set
    # -------------------------------
    for k in range(int(np.ceil(len(X_test) / batch))):
        X_test_batch = X_test[batch * k : batch * (k + 1)]
        y_test_batch = y_test[batch * k : batch * (k + 1)]
        m_test = X_test_batch.shape[0]

        # Forward pass (no augmentation, no label smoothing)
        A = X_test_batch
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)

        # Compute test cost
        cost_test = (-1 / m_test) * np.sum(y_test_batch * np.log(A + 1e-8))
        cost_test += (lmbda / (2 * m_test)) * sum(np.sum(w ** 2) for w in W)
        all_cost_test.append(cost_test)

        # Compute test accuracy
        y_pred_test = np.argmax(A, axis=1)
        y_true_test = np.argmax(y_test_batch, axis=1)
        accuracy_test = np.mean(y_pred_test == y_true_test) * 100
        all_accuracy_test.append(accuracy_test)

    # -------------------------------
    # Record epoch statistics
    # -------------------------------
    mean_acc_train = np.mean(all_accuracy)
    mean_acc_test = np.mean(all_accuracy_test)
    mean_cost_train = np.mean(all_cost)
    mean_cost_test = np.mean(all_cost_test)

    whole_accuracy.append(mean_acc_train)
    whole_accuracy_test.append(mean_acc_test)
    whole_cost.append(mean_cost_train)
    whole_cost_test.append(mean_cost_test)

    # Save model if test accuracy improves
    # NOTE(review): the checkpoint is selected on the test split, so
    # best_test is an optimistic estimate of held-out accuracy.
    if best_test < mean_acc_test:
        best_test = mean_acc_test
        for idx in range(len(W)):
            np.save(f"/kaggle/working/W{idx}.npy", W[idx])
            np.save(f"/kaggle/working/B{idx}.npy", B[idx])

    # Print progress
    print(
        f"Epoch {e} | Best Test Acc: {best_test:.2f}% | "
        f"Train Acc: {mean_acc_train:.2f}% | Test Acc: {mean_acc_test:.2f}% | "
        f"Train Cost: {mean_cost_train:.4f} | Test Cost: {mean_cost_test:.4f}"
    )
