# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# ---------------------------------
# Load Fashion MNIST CSV
# ---------------------------------
df = pd.read_csv("/kaggle/input/fashion-mnist-train-csv/fashion-mnist_train.csv")

# One group of rows per value of the "label" column, so every class
# can be split on its own.
grouped = df.groupby("label")

# ---------------------------------
# Split each class into train/test
# ---------------------------------
# One (train_part, test_part) pair per class: 80% / 20%, shuffled with a
# fixed seed so the split is reproducible.
per_class_splits = [
    train_test_split(
        class_rows,
        test_size=0.2,
        random_state=42,
        shuffle=True,
        stratify=None,  # each group already holds a single class
    )
    for _, class_rows in grouped
]
train_list = [pair[0] for pair in per_class_splits]
test_list = [pair[1] for pair in per_class_splits]

# Stitch the per-class pieces back together, then shuffle the rows (fixed
# seed) and renumber the index from zero.
combined_train = pd.concat(train_list)
combined_test = pd.concat(test_list)
train_df = combined_train.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = combined_test.sample(frac=1, random_state=42).reset_index(drop=True)

# ---------------------------------
# Plot one example per class
# ---------------------------------
# First training row of every label, with "label" restored as a column.
examples = train_df.groupby("label").first().reset_index()
example_labels = examples["label"]
example_pixels = examples.drop("label", axis=1)

plt.figure(figsize=(10, 4))
for i in range(10):
    # 2 x 5 grid; subplot positions are 1-based.
    ax = plt.subplot(2, 5, i + 1)
    # The remaining columns of a row are reshaped into a 28 x 28 image.
    img = example_pixels.loc[i].values.astype(np.uint8).reshape(28, 28)
    plt.imshow(img, cmap="gray")
    plt.title(f"Label: {example_labels.loc[i]}")
    plt.axis("off")
plt.tight_layout()
plt.show()

# ---------------------------------
# Prepare training and test data
# ---------------------------------
# Features become float32 matrices; labels become one-hot rows.
X = train_df.drop("label", axis=1).values.astype(np.float32)
y = train_df["label"].values
# Labels index rows of an identity matrix below, so the number of classes
# is taken as (largest training label + 1).
num_classes = np.max(y) + 1
y = np.eye(num_classes)[y]

X_test = test_df.drop("label", axis=1).values.astype(np.float32)
y_test = test_df["label"].values
y_test = np.eye(num_classes)[y_test]

# Seed NumPy's global RNG (the weight initialisation later in the file
# draws from it via np.random.randn).
np.random.seed(0)

# Normalize inputs: per-column statistics come from the training split
# only and are reused unchanged for the test split.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
# A column that is constant over the training split has std == 0; dividing
# by it would fill that column with NaN/inf. Dividing by 1 instead leaves
# the (already centred) column untouched and changes nothing for columns
# with non-zero spread.
std[std == 0] = 1.0
X = (X - mean) / std
X_test = (X_test - mean) / std

print(X.shape, y.shape, X_test.shape, y_test.shape)

# ---------------------------------
# Activation functions
# ---------------------------------
def relu(x, grad):
    """Rectified linear unit.

    Args:
        x: NumPy array of pre-activations.
        grad: When truthy, return the derivative instead of the activation.

    Returns:
        ``max(0, x)`` element-wise, or, when ``grad`` is truthy, a float
        array holding 1.0 where ``x > 0`` and 0.0 elsewhere.
    """
    if not grad:
        return np.maximum(0, x)
    positive_mask = x > 0
    return positive_mask.astype(float)

def sigmoid(x, grad):
    """Logistic sigmoid, computed without overflowing for large ``|x|``.

    Args:
        x: NumPy array (or scalar) of pre-activations.
        grad: When truthy, return the derivative ``s * (1 - s)`` instead of
            the activation ``s``.

    Returns:
        NumPy array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow, whereas
    # exp(-x) overflows to inf for large negative x.
    z = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + exp(-x)). For x < 0 the same value rewritten as
    # exp(x) / (1 + exp(x)).
    s = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    if grad:
        return s * (1 - s)
    return s

def softmax(z, grad):
    """Row-wise softmax of a 2-D array of scores.

    Args:
        z: Array whose axis 1 holds the scores to normalise.
        grad: Accepted so the signature matches the other activations; it
            is not used, and the softmax values are returned either way.

    Returns:
        Array shaped like ``z`` in which every row sums to 1.
    """
    # Subtracting each row's maximum keeps the largest exponent at 0.
    row_peak = np.max(z, axis=1, keepdims=True)
    weights = np.exp(z - row_peak)
    return weights / weights.sum(axis=1, keepdims=True)

# ---------------------------------
# Precision and Recall function
# ---------------------------------
def precision_recall(y_true, y_pred):
    """Precision and recall pooled over every entry of two 0/1 arrays.

    The counts are summed over the whole arrays (all rows and all columns
    together), so a single precision and a single recall are returned
    rather than one value per class.

    Args:
        y_true: Array of 0/1 ground-truth indicators.
        y_pred: Array of 0/1 predicted indicators, same shape as ``y_true``.

    Returns:
        Tuple ``(precision, recall)``.
    """
    predicted_pos = y_pred == 1
    actual_pos = y_true == 1

    true_pos = np.sum(actual_pos & predicted_pos)
    false_pos = np.sum((y_true == 0) & predicted_pos)
    false_neg = np.sum(actual_pos & (y_pred == 0))

    # Small constant keeps the ratios defined when a denominator is zero.
    eps = 1e-8
    return true_pos / (true_pos + false_pos + eps), true_pos / (true_pos + false_neg + eps)

# ---------------------------------
# Network architecture
# ---------------------------------
arch = [300, 300, 10]                # number of units in each layer
activations = [relu, relu, softmax]  # activation applied after each layer

# Training settings
alpha = 0.002   # step size used in the weight updates
batch = 32      # rows per mini-batch
best_test = 0   # highest mean test accuracy seen so far

# He initialization: standard-normal weights scaled by sqrt(2 / fan_in),
# biases start at zero. The first layer's fan-in is the input width; every
# later layer's fan-in is the previous layer's size.
fan_ins = [X.shape[1]] + arch[:-1]

W = []
B = []
for fan_in, fan_out in zip(fan_ins, arch):
    W.append(np.random.randn(fan_in, fan_out) * np.sqrt(2. / fan_in))
    B.append(np.zeros((1, fan_out)))

# ---------------------------------
# Training loop
# ---------------------------------
# Per-epoch histories: mean over mini-batches of accuracy (%) and loss.
whole_accuracy = []
whole_accuracy_test = []
whole_cost = []
whole_cost_test = []

for e in range(600):
    # Per-batch metrics collected during this epoch
    all_accuracy = []
    all_cost = []
    all_precision = []
    all_recall = []
    all_accuracy_test = []
    all_cost_test = []

    # Mini-batch training
    for i in range(int(np.ceil(len(X) / batch))):
        X_batch = X[batch * i : batch * (i + 1)]
        y_batch = y[batch * i : batch * (i + 1)]
        m_batch = X_batch.shape[0]  # the last batch may be shorter than `batch`

        # Forward pass: keep every layer's pre-activation (Z) and
        # activation (A) for use in backpropagation.
        A = X_batch
        all_A = []
        all_Z = []
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)
            all_A.append(A)
            all_Z.append(Z)

        # Compute cross-entropy loss (1e-8 avoids log(0))
        cost = (-1 / m_batch) * np.sum(y_batch * np.log(A + 1e-8))
        all_cost.append(cost)

        # Compute accuracy
        y_pred = np.argmax(A, axis=1)
        y_pred_oh = np.eye(num_classes)[y_pred]
        y_true = np.argmax(y_batch, axis=1)
        accuracy = np.mean(y_pred == y_true) * 100
        all_accuracy.append(accuracy)

        # Compute precision and recall
        precision, recall = precision_recall(y_batch, y_pred_oh)
        all_precision.append(precision)
        all_recall.append(recall)

        # Backpropagation
        for j in reversed(range(len(W))):
            if j == len(W) - 1:
                # Softmax output with cross-entropy loss: the gradient
                # w.r.t. the pre-activation is (prediction - target).
                dz = all_A[j] - y_batch
            else:
                # `da` was computed in the previous iteration from layer
                # j+1's weights as they were during the forward pass.
                dz = da * activations[j](all_Z[j], grad=True)

            dw = X_batch.T @ dz if j == 0 else all_A[j-1].T @ dz
            db = np.sum(dz, axis=0, keepdims=True)

            # Propagate the gradient to the layer below BEFORE touching
            # W[j]; using the already-updated weights here would mix two
            # different parameter states into one gradient.
            if j > 0:
                da = dz @ W[j].T

            W[j] -= (alpha / m_batch) * dw
            B[j] -= (alpha / m_batch) * db

    # Evaluate on test set
    for i_batch in range(int(np.ceil(len(X_test) / batch))):
        X_test_batch = X_test[batch * i_batch : batch * (i_batch + 1)]
        y_test_batch = y_test[batch * i_batch : batch * (i_batch + 1)]
        m_test = X_test_batch.shape[0]

        # Forward pass only; no intermediate values are kept
        A = X_test_batch
        for j in range(len(W)):
            Z = A @ W[j] + B[j]
            A = activations[j](Z, grad=False)

        cost_test = (-1 / m_test) * np.sum(y_test_batch * np.log(A + 1e-8))
        all_cost_test.append(cost_test)

        y_pred_test = np.argmax(A, axis=1)
        y_true_test = np.argmax(y_test_batch, axis=1)
        accuracy_test = np.mean(y_pred_test == y_true_test) * 100
        all_accuracy_test.append(accuracy_test)

    # Log metrics (unweighted mean of the per-batch values)
    mean_acc_train = np.mean(all_accuracy)
    mean_acc_test = np.mean(all_accuracy_test)
    mean_cost_train = np.mean(all_cost)
    mean_cost_test = np.mean(all_cost_test)
    mean_precision = np.mean(all_precision)
    mean_recall = np.mean(all_recall)

    whole_accuracy.append(mean_acc_train)
    whole_accuracy_test.append(mean_acc_test)
    whole_cost.append(mean_cost_train)
    whole_cost_test.append(mean_cost_test)

    # Save best weights: overwrite the files whenever the mean test
    # accuracy improves on the best value so far.
    if best_test < mean_acc_test:
        best_test = mean_acc_test
        for k in range(len(W)):
            np.save(f"/kaggle/working/W{k}.npy", W[k])
            np.save(f"/kaggle/working/B{k}.npy", B[k])

    # Print progress
    print(
        f"epochs: {e} | best_test {best_test:.2f} | "
        f"accuracy_train: {mean_acc_train:.2f} | accuracy_test: {mean_acc_test:.2f} | "
        f"cost_train: {mean_cost_train:.4f} | cost_test: {mean_cost_test:.4f} | "
        f"precision: {mean_precision:.4f} | recall: {mean_recall:.4f}"
    )
