# In [None]:
# ==========================================================
# Import required libraries
# ==========================================================
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import cv2

# ==========================================================
# Read the Fashion-MNIST training CSV into a DataFrame
# (the code below expects a "label" column plus pixel columns)
# ==========================================================
df = pd.read_csv(
    "/kaggle/input/fashion-mnist-train-csv/fashion-mnist_train.csv"
)

# ==========================================================
# Hold out 20% of every class as the test subset.
# Splitting class by class keeps the class proportions equal
# in both subsets, so sklearn's own `stratify` is left off.
# ==========================================================
grouped = df.groupby("label")

# One (train, test) pair per class, produced in group order.
class_splits = [
    train_test_split(
        class_rows,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=None,  # Manual stratification by class
    )
    for _, class_rows in grouped
]
train_list = [train_part for train_part, _ in class_splits]
test_list = [test_part for _, test_part in class_splits]

# Merge the per-class pieces, then shuffle so the classes are interleaved.
train_df = pd.concat(train_list)
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_list)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# ==========================================================
# Plot the first training sample of each class (2 x 5 grid)
# ==========================================================
examples = train_df.groupby("label").first().reset_index()

# Pixel values of all example rows at once, one row per class.
example_pixels = examples.drop(columns="label").values.astype(np.uint8)

plt.figure(figsize=(10, 4))
for i in range(10):
    ax = plt.subplot(2, 5, i + 1)
    img = example_pixels[i].reshape(28, 28)
    plt.imshow(img, cmap="gray")
    plt.title(f"Label: {examples.loc[i, 'label']}")
    plt.axis("off")
plt.tight_layout()
plt.show()

# ==========================================================
# Build float32 feature matrices and one-hot label matrices
# ==========================================================
X = train_df.drop(columns="label").values.astype(np.float32)
X_test = test_df.drop(columns="label").values.astype(np.float32)

# The number of classes is derived from the largest training label.
y = train_df["label"].values
num_classes = np.max(y) + 1

# Row k of the identity matrix is the one-hot vector for class k.
one_hot_rows = np.eye(num_classes)
y = one_hot_rows[y]
y_test = one_hot_rows[test_df["label"].values]

# ==========================================================
# Standardize the data (zero mean, unit variance)
# ==========================================================
# Seed NumPy's global RNG; the weight initialization and the random
# augmentation further down draw from this stream.
np.random.seed(0)

# Per-feature statistics are estimated on the training split only.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)

# A feature that is constant over the training split has std == 0, and
# dividing by it would fill that column with NaN/inf. Use a unit divisor
# for such columns; all other columns are scaled exactly as before.
std[std == 0] = 1.0

X = (X - mean) / std
X_test = (X_test - mean) / std  # Use same mean/std as training

print(X.shape, y.shape, X_test.shape, y_test.shape)

# ==========================================================
# Activation functions with optional gradient computation
# ==========================================================
def relu(x, grad):
    """Apply ReLU to `x`, or return its derivative when `grad` is truthy.

    The derivative is 1.0 where `x > 0` and 0.0 elsewhere (including x == 0).
    """
    return (x > 0).astype(float) if grad else np.maximum(0, x)

def sigmoid(x, grad):
    """Apply the logistic function to `x`, or return its derivative.

    With `grad` truthy the result is s * (1 - s), where s is the logistic
    function evaluated at `x`; otherwise the result is s itself.
    """
    s = 1 / (1 + np.exp(-x))
    return s * (1 - s) if grad else s

def softmax(z, grad):
    """Return the row-wise softmax of the 2-D array `z`.

    `grad` is accepted so the signature matches the other activations, but it
    is ignored: the softmax values are returned whatever its value.
    """
    # Subtracting each row's maximum keeps the exponentials from overflowing
    # and does not change the result.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exps = np.exp(shifted)
    row_totals = np.sum(exps, axis=1, keepdims=True)
    return exps / row_totals

# ==========================================================
# Network layout: one entry per layer, listed input -> output
# ==========================================================
# Activation applied after each layer's affine transform.
activations = [relu, relu, softmax]
# Number of units in each layer; the last entry is the output layer.
arch = [300, 300, 10]

# Per-layer weight matrices and bias rows (filled in further down)
W, B = [], []

# Optimisation settings
batch = 32             # Batch size
alpha = 0.002          # Learning rate
lmbda = 0.001          # L2 regularization strength

# Running record of the best test accuracy observed
best_test = 0

# ==========================================================
# Data augmentation function
# Random flipping and rotation
# ==========================================================
def augmentation(img):
    """Randomly flip and/or rotate one 28x28 image.

    Two independent draws from NumPy's global RNG decide, each with
    probability 0.5, whether to mirror the image left-right and whether to
    rotate it. The rotation angle is uniform in [-15, 15) degrees about the
    point (14, 14), and border pixels are filled by reflection.
    """
    flip = np.random.rand() < 0.5
    if flip:
        img = np.fliplr(img)

    rotate = np.random.rand() < 0.5
    if not rotate:
        return img

    angle = np.random.uniform(-15, 15)
    rotation = cv2.getRotationMatrix2D((14, 14), angle, 1.0)
    return cv2.warpAffine(img, rotation, (28, 28), borderMode=cv2.BORDER_REFLECT)

# ==========================================================
# Optional label smoothing function (not used here)
# ==========================================================
def label_smoothing(y_one_hot, epsilon=0.1):
    """Blend one-hot targets with a uniform distribution over the classes.

    Every entry is scaled by (1 - epsilon) and then raised by epsilon / K,
    where K is the number of columns of `y_one_hot`.
    """
    n_classes = y_one_hot.shape[1]
    uniform_share = epsilon / n_classes
    return (1 - epsilon) * y_one_hot + uniform_share

# ==========================================================
# He initialization: weights ~ N(0, 2 / fan_in), biases = 0
# ==========================================================
# The fan-in of the first layer is the feature count; every later layer
# takes the width of the layer before it.
layer_inputs = [X.shape[1]] + arch[:-1]
for fan_in, fan_out in zip(layer_inputs, arch):
    W.append(np.random.randn(fan_in, fan_out) * np.sqrt(2. / fan_in))
    B.append(np.zeros((1, fan_out)))

# ==========================================================
# Training loop
# Mini-batch gradient descent with L2 regularization. Every
# epoch trains on all (augmented) training batches, evaluates
# on the test split, and saves the weights whenever the mean
# test accuracy improves.
# ==========================================================
whole_accuracy = []
whole_accuracy_test = []
whole_cost = []
whole_cost_test = []

num_layers = len(W)
num_train_batches = int(np.ceil(len(X) / batch))
num_test_batches = int(np.ceil(len(X_test) / batch))

for e in range(600):  # Number of epochs
    all_accuracy = []
    all_accuracy_test = []
    all_cost = []
    all_cost_test = []

    # Training batches
    # NOTE: batches are visited in the same order every epoch; only the
    # augmentation is random.
    for i_batch in range(num_train_batches):
        X_batch = X[batch * i_batch : batch * (i_batch + 1)]
        y_batch = y[batch * i_batch : batch * (i_batch + 1)]

        # Data augmentation works on 28x28 images, so un-flatten and re-flatten.
        X_image = X_batch.reshape(-1, 28, 28)
        X_aug = np.array([augmentation(img) for img in X_image])
        X_aug_flat = X_aug.reshape(-1, 784)

        # Forward pass, keeping every layer's pre-activation (Z) and
        # activation (A) for the backward pass.
        A = X_aug_flat
        all_A = []
        all_Z = []
        m_batch = X_aug_flat.shape[0]

        for layer in range(num_layers):
            Z = A @ W[layer] + B[layer]
            A = activations[layer](Z, grad=False)
            all_A.append(A)
            all_Z.append(Z)

        # Cross-entropy cost (1e-8 guards log(0)) plus the L2 penalty
        cost = (-1 / m_batch) * np.sum(y_batch * np.log(A + 1e-8))
        cost += (lmbda / (2 * m_batch)) * sum(np.sum(w ** 2) for w in W)
        all_cost.append(cost)

        # Compute accuracy
        y_pred = np.argmax(A, axis=1)
        y_true = np.argmax(y_batch, axis=1)
        accuracy = np.mean(y_pred == y_true) * 100
        all_accuracy.append(accuracy)

        # Backward pass and parameter update.
        # For softmax + cross-entropy the output-layer error is (A - y).
        dz = all_A[-1] - y_batch
        for layer in range(num_layers - 1, -1, -1):
            layer_input = X_aug_flat if layer == 0 else all_A[layer - 1]
            dw = layer_input.T @ dz
            db = np.sum(dz, axis=0, keepdims=True)

            # Propagate the error to the previous layer BEFORE updating this
            # layer's weights: the gradient must be taken through the weights
            # that were used in the forward pass.
            if layer > 0:
                dz_prev = (dz @ W[layer].T) * activations[layer - 1](
                    all_Z[layer - 1], grad=True
                )

            # Gradient descent step with L2 regularization
            W[layer] -= (alpha / m_batch) * (dw + lmbda * W[layer])
            B[layer] -= (alpha / m_batch) * db

            if layer > 0:
                dz = dz_prev

    # Validation batches. The weights are fixed during evaluation, so the
    # L2 penalty term is computed once per epoch.
    l2_penalty = sum(np.sum(w ** 2) for w in W)
    for i_batch in range(num_test_batches):
        X_test_batch = X_test[batch * i_batch : batch * (i_batch + 1)]
        y_test_batch = y_test[batch * i_batch : batch * (i_batch + 1)]
        A = X_test_batch
        m_test = X_test_batch.shape[0]

        for layer in range(num_layers):
            Z = A @ W[layer] + B[layer]
            A = activations[layer](Z, grad=False)

        cost_test = (-1 / m_test) * np.sum(y_test_batch * np.log(A + 1e-8))
        cost_test += (lmbda / (2 * m_test)) * l2_penalty
        all_cost_test.append(cost_test)

        y_pred_test = np.argmax(A, axis=1)
        y_true_test = np.argmax(y_test_batch, axis=1)
        accuracy_test = np.mean(y_pred_test == y_true_test) * 100
        all_accuracy_test.append(accuracy_test)

    # Track epoch results (unweighted means over the batches)
    epoch_accuracy = np.mean(all_accuracy)
    epoch_accuracy_test = np.mean(all_accuracy_test)
    epoch_cost = np.mean(all_cost)
    epoch_cost_test = np.mean(all_cost_test)

    whole_accuracy.append(epoch_accuracy)
    whole_accuracy_test.append(epoch_accuracy_test)
    whole_cost.append(epoch_cost)
    whole_cost_test.append(epoch_cost_test)

    # Save best model weights
    if best_test < epoch_accuracy_test:
        best_test = epoch_accuracy_test
        for k in range(num_layers):
            np.save(f"/kaggle/working/W{k}.npy", W[k])
            np.save(f"/kaggle/working/B{k}.npy", B[k])

    print(
        f"epochs: {e} | best_test: {best_test:.2f}% | "
        f"accuracy_train: {epoch_accuracy:.2f}% | "
        f"accuracy_test: {epoch_accuracy_test:.2f}% | "
        f"cost_train: {epoch_cost:.4f} | "
        f"cost_test: {epoch_cost_test:.4f}"
    )
