In [30]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [31]:
# Read the training and test tables. Both paths are relative to the notebook's
# working directory, so the kernel must be started where the data/ folder lives.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [32]:
# Preview the first rows of the training frame to confirm the column layout.
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Build the training matrix from the frame; at this point each row is one sample
# and the code below treats the first column as the label.
X_train_full = np.array(train_data)

# Shuffle the samples (rows) in place before transposing and before any
# train/validation split. A dedicated seeded Generator is used so the sample
# order is identical on every Restart & Run All, without touching NumPy's
# global random state.
np.random.default_rng(42).shuffle(X_train_full)

X_train_full = X_train_full.T  # samples now arranged as columns
Y_train_full = X_train_full[0]  # first row after the transpose holds the labels
X_train_full = X_train_full[1:]  # purify data matrix by removing the labels row

# n = number of features (rows), m = number of samples (columns)
n, m = X_train_full.shape
print(n, m)

784 42000


In [34]:
# Convert the test frame to a matrix and transpose it in one step so that,
# like the training matrix, each column is one sample.
X_test = np.array(test_data).T

In [35]:
# Display the prepared training matrix (features as rows, samples as columns).
X_train_full

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(784, 42000))

In [36]:
# Largest label value present in the training targets.
max_index = Y_train_full.max()
print(max_index)

9


In [37]:
def init_params(n_hidden=10, n_features=784, n_output=10):
    """Draw small random starting values for both layers of the network.

    Every array is sampled from a standard normal distribution (via the
    global ``np.random`` state) and scaled by 0.01.

    Parameters
    ----------
    n_hidden : int
        Number of hidden units.
    n_features : int
        Number of input features per sample.
    n_output : int
        Number of output classes.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` with shapes ``(n_hidden, n_features)``,
        ``(n_hidden, 1)``, ``(n_output, n_hidden)`` and ``(n_output, 1)``.
    """
    init_scale = 0.01
    # Order matters: the arrays are drawn in the order W1, b1, W2, b2.
    shapes = (
        (n_hidden, n_features),
        (n_hidden, 1),
        (n_output, n_hidden),
        (n_output, 1),
    )
    W1, b1, W2, b2 = (init_scale * np.random.randn(*shape) for shape in shapes)
    return W1, b1, W2, b2

In [38]:
def softmax(Z):
    """Column-wise softmax.

    Each column of ``Z`` is treated as one vector of logits. The column
    maximum is subtracted before exponentiating so large logits do not
    overflow; this does not change the result.

    Returns an array of the same shape as ``Z`` whose columns each sum to 1.
    """
    logits = np.asarray(Z)
    column_peak = logits.max(axis=0, keepdims=True)
    numerators = np.exp(logits - column_peak)
    denominators = numerators.sum(axis=0, keepdims=True)
    return numerators / denominators

In [39]:
def forwardpass(W1, b1, W2, b2, X):
    """Run the two-layer network (ReLU hidden layer, softmax output) on ``X``.

    Parameters
    ----------
    W1, b1 : ndarray
        Hidden-layer weights and single-column bias.
    W2, b2 : ndarray
        Output-layer weights and single-column bias.
    X : ndarray
        Input matrix with one sample per column.

    Returns
    -------
    tuple
        ``(Z1, A1, Z2, A2)``: hidden pre-activations, hidden ReLU activations,
        output pre-activations, and the softmax output (one column per sample).
    """
    # Layer 1. The bias has a single column, so NumPy broadcasting adds it to
    # every sample column; no explicit (units x samples) copy of the bias is
    # built, which previously happened on every call.
    Z1 = np.dot(W1, X) + b1
    A1 = np.maximum(0, Z1)  # ReLU: hidden activations for each sample

    # Layer 2: same broadcasting of the bias, then softmax over each column.
    Z2 = np.dot(W2, A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

In [40]:
def backprop(W1, b1, W2, b2, X, Y, Z1, A1, Z2, A2):
    """Compute parameter gradients from the cached forward-pass values.

    The output error is the softmax output minus 1 at each sample's true
    class, scaled by ``1 / n_samples``; it is then propagated back through
    the output weights and the ReLU.

    Parameters
    ----------
    W1, b1, b2, Z2
        Accepted for a uniform call signature; not read by this function.
    W2 : ndarray
        Output-layer weights, used to push the error back to the hidden layer.
    X : ndarray
        Input matrix, one sample per column.
    Y : ndarray of int
        True class index for each sample column.
    Z1, A1 : ndarray
        Hidden pre-activations and activations from the forward pass.
    A2 : ndarray
        Softmax output from the forward pass (not modified; a copy is used).

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``, each shaped like its parameter.
    """
    batch_size = X.shape[1]
    sample_cols = np.arange(batch_size)

    # Output layer error: subtract 1 at (true class, sample) and average.
    output_delta = A2.copy()
    output_delta[Y, sample_cols] -= 1
    output_delta = (1 / batch_size) * output_delta

    grad_W2 = output_delta @ A1.T
    grad_b2 = output_delta.sum(axis=1, keepdims=True)

    # Hidden layer error: only units whose pre-activation was positive pass
    # gradient through the ReLU.
    hidden_delta = (W2.T @ output_delta) * (Z1 > 0)

    grad_W1 = hidden_delta @ X.T
    grad_b1 = hidden_delta.sum(axis=1, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2

In [41]:
def update_params(W1, b1, W2, b2, Z1, A1, Z2, A2, X, Y, learning_rate=0.01):
    """Take one gradient-descent step on all four parameter arrays.

    Gradients come from ``backprop`` using the supplied forward-pass values;
    each parameter is moved against its gradient by ``learning_rate``.

    Returns the updated ``(W1, b1, W2, b2)`` as new arrays; the inputs are
    not modified in place.
    """
    gradients = backprop(W1, b1, W2, b2, X, Y, Z1, A1, Z2, A2)
    W1, b1, W2, b2 = (
        param - learning_rate * grad
        for param, grad in zip((W1, b1, W2, b2), gradients)
    )
    return W1, b1, W2, b2

In [42]:
def cost(A2, Y):
    """Mean cross-entropy of the predicted probabilities against labels ``Y``.

    ``A2`` holds one probability column per sample and ``Y`` the true class
    index of each sample. The result is the negative log of the probability
    assigned to each true class, averaged over the samples.
    """
    sample_count = Y.shape[0]
    # Pick, for every sample column, the probability of its true class.
    true_class_probs = A2[Y, np.arange(sample_count)]
    total_log_likelihood = np.sum(np.log(true_class_probs))
    return -total_log_likelihood / sample_count

In [43]:
def final_predictions(A2):
    """Return, for each sample column of ``A2``, the row index of its largest value."""
    probabilities = np.asarray(A2)
    return probabilities.argmax(axis=0)

In [44]:
def model_probabilities(W1, b1, W2, b2, X):
    """Return only the softmax output of a forward pass over ``X``.

    ``forwardpass`` returns ``(Z1, A1, Z2, A2)``; the intermediate values are
    discarded and the last element (the class probabilities) is returned.
    """
    return forwardpass(W1, b1, W2, b2, X)[3]

In [45]:
def accuracy(A2, Y):
    """Fraction of samples whose highest-probability class equals the label.

    ``A2`` holds one probability column per sample; ``Y`` holds the true
    class index of each sample.
    """
    predicted = final_predictions(A2)
    n_correct = (predicted == Y).sum()
    return n_correct / Y.size

In [46]:
def validation_and_decent(learning_rate=0.01, n_splits=5, n_iterations=500):
    """Cross-validate one learning rate with full-batch gradient descent.

    The notebook-level ``X_train_full`` / ``Y_train_full`` are split into
    ``n_splits`` stratified folds (shuffled with a fixed ``random_state``).
    For every fold a freshly initialised network with 10 hidden units and
    10 outputs is trained for ``n_iterations`` steps on the training part,
    then scored on the held-out part. Per-fold cost and accuracy are printed.

    Parameters
    ----------
    learning_rate : float
        Step size passed to ``update_params``.
    n_splits : int
        Number of cross-validation folds.
    n_iterations : int
        Number of gradient-descent steps per fold.

    Returns
    -------
    float
        Validation cost averaged over the folds.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_costs = []

    # The splitter expects one sample per row, hence the transpose here only;
    # the returned index arrays are then used to select sample columns.
    for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X_train_full.T, Y_train_full)):
        X_fit, Y_fit = X_train_full[:, train_idx], Y_train_full[train_idx]
        X_held_out, Y_held_out = X_train_full[:, val_idx], Y_train_full[val_idx]

        # Every fold starts from new random parameters.
        W1, b1, W2, b2 = init_params(n_hidden=10, n_features=X_fit.shape[0], n_output=10)

        for _ in range(n_iterations):
            # forwardpass returns (Z1, A1, Z2, A2), exactly the positional
            # arguments update_params expects after the parameters.
            W1, b1, W2, b2 = update_params(
                W1, b1, W2, b2,
                *forwardpass(W1, b1, W2, b2, X_fit),
                X_fit, Y_fit, learning_rate
            )

        A2_val = forwardpass(W1, b1, W2, b2, X_held_out)[3]
        fold_cost = cost(A2_val, Y_held_out)
        fold_accuracy = accuracy(A2_val, Y_held_out)
        print(f"Fold {fold_idx} — Cost: {fold_cost:.4f}, Acc: {fold_accuracy:.4f}")
        fold_costs.append(fold_cost)

    # Built-in sum adds the fold costs in fold order, starting from 0.
    average_cost = sum(fold_costs) / len(fold_costs)
    print(f"\nHyperparameter: learning_rate = {learning_rate}")
    print(f"Average Cost: {average_cost:.4f}")
    return average_cost
    

In [47]:
# Main validation loop: cross-validate each candidate learning rate and keep
# the one with the lowest average validation cost. `best_lr` and `lowest_cost`
# are notebook-level names; `best_lr` is read again when the final model is trained.
learning_rates = [1e-5, 1e-4, 1e-3, 5e-3, 1e-2]
lowest_cost     = float("inf")
best_lr         = None

for lr in learning_rates:
    # Each call trains and evaluates 5 folds for 500 iterations and prints per-fold results.
    avg_cost = validation_and_decent(learning_rate=lr, n_splits=5, n_iterations=500)
    print(f"LR={lr:.5f} → Avg Cost={avg_cost:.4f}")
    # Strict '<': on a tie the earlier candidate is kept, and a NaN or inf
    # average is never selected (best_lr stays None if no candidate qualifies).
    if avg_cost < lowest_cost:
        lowest_cost = avg_cost
        best_lr     = lr

print(f"\nBest learning rate: {best_lr}  (Avg Cost={lowest_cost:.4f})\n")


Fold 0 — Cost: 2.1381, Acc: 0.2099
Fold 1 — Cost: 2.0477, Acc: 0.2626
Fold 2 — Cost: 2.2112, Acc: 0.2144
Fold 3 — Cost: 2.1780, Acc: 0.2105
Fold 4 — Cost: 2.1404, Acc: 0.2202

Hyperparameter: learning_rate = 1e-05
Average Cost: 2.1431
LR=0.00001 → Avg Cost=2.1431
Fold 0 — Cost: 0.8134, Acc: 0.7485
Fold 1 — Cost: 0.7524, Acc: 0.7749
Fold 2 — Cost: 0.8303, Acc: 0.7395
Fold 3 — Cost: 0.6994, Acc: 0.8007
Fold 4 — Cost: 0.7484, Acc: 0.7717

Hyperparameter: learning_rate = 0.0001
Average Cost: 0.7688
LR=0.00010 → Avg Cost=0.7688
Fold 0 — Cost: 0.3329, Acc: 0.9067
Fold 1 — Cost: 0.3433, Acc: 0.9037
Fold 2 — Cost: 0.3204, Acc: 0.9083
Fold 3 — Cost: 0.3408, Acc: 0.9037
Fold 4 — Cost: 0.3454, Acc: 0.8969

Hyperparameter: learning_rate = 0.001
Average Cost: 0.3366
LR=0.00100 → Avg Cost=0.3366
Fold 0 — Cost: 1.6479, Acc: 0.3568
Fold 1 — Cost: 0.6815, Acc: 0.7785
Fold 2 — Cost: 2.0095, Acc: 0.2039
Fold 3 — Cost: 0.4153, Acc: 0.8739
Fold 4 — Cost: 1.6557, Acc: 0.3783

Hyperparameter: learning_rate =

In [48]:
def final_train(learning_rate=0.01, n_iterations=500):
    """Train the network on the whole training set and report the training fit.

    Starts from freshly initialised parameters (10 hidden units, 10 outputs),
    runs ``n_iterations`` full-batch gradient-descent steps on the
    notebook-level ``X_train_full`` / ``Y_train_full``, then prints cost and
    accuracy measured on that same data (so the printed figures are training
    metrics, not held-out ones).

    Parameters
    ----------
    learning_rate : float
        Step size passed to ``update_params``.
    n_iterations : int
        Number of gradient-descent steps.

    Returns
    -------
    tuple
        The trained ``(W1, b1, W2, b2)``.
    """
    W1, b1, W2, b2 = init_params(
        n_hidden=10,
        n_features=X_train_full.shape[0],
        n_output=10
    )

    for _ in range(n_iterations):
        # forwardpass yields (Z1, A1, Z2, A2), which update_params takes
        # positionally right after the four parameters.
        cached = forwardpass(W1, b1, W2, b2, X_train_full)
        W1, b1, W2, b2 = update_params(
            W1, b1, W2, b2,
            *cached,
            X_train_full, Y_train_full,
            learning_rate
        )

    # Score the trained model on the data it was trained on.
    A2_full = forwardpass(W1, b1, W2, b2, X_train_full)[3]
    full_cost = cost(A2_full, Y_train_full)
    full_accuracy = accuracy(A2_full, Y_train_full)

    print(f"Train Cost: {full_cost:.4f}, Train Acc: {full_accuracy:.4f}\n")
    return W1, b1, W2, b2

In [49]:
# Retrain on the full training set with the learning rate chosen by the
# validation loop; the resulting parameters are used for the test predictions.
W1, b1, W2, b2 = final_train(learning_rate=best_lr)

Train Cost: 0.3109, Train Acc: 0.9101



In [50]:
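# Disabled: no Y_test is defined anywhere in this notebook, so a test cost and
# accuracy cannot be computed. The lines below are kept for reference only.
# _, _, _, A2_final = forwardpass(W1, b1, W2, b2, X_test)
# final_cost = cost(A2_final, Y_test)
# final_accuracy = accuracy(A2_final, Y_test)
# print(f"Test Cost: {final_cost:.4f}, Test Acc: {final_accuracy:.4f}\n")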

In [51]:
# Sanity check: the row count should match the feature count of X_train_full.
X_test.shape

(784, 28000)

In [53]:
# Predict a class for every test sample with the trained parameters.
_, _, _, A2_test = forwardpass(W1, b1, W2, b2, X_test)
preds = final_predictions(A2_test)  # one predicted label per test sample

# Image ids are 1-based and follow the column order of X_test.
image_ids = np.arange(1, len(preds) + 1)
submission = pd.DataFrame({"ImageId": image_ids, "Label": preds})

# Write the two-column table without the DataFrame index.
submission.to_csv("submission.csv", index=False)