In [25]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [26]:
data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [27]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
X_train_full = np.array(data)
X_train_full = X_train_full.T #samples now arranged as columns
Y_train_full = X_train_full[0]
Y_train_full.shape
X_train_full = X_train_full[1:] #purify data matrix by removing lables row
np.random.shuffle(X_train_full) #shuffle data before transpose and before splitting into train and test
n, m = X_train_full.shape
print(n, m)

784 42000


In [29]:
X_train_full

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   6, ..., 255, 199,   5],
       [  0,   0,   0, ...,   0,   0,   0]])

In [30]:
max_index = max(Y_train_full)
print(max_index)

9


In [31]:
def init_params(n_hidden=10, n_features=784, n_output=10):
    W1 = np.random.randn(n_hidden, n_features) * 0.01
    b1 = np.random.randn(n_hidden, 1) * 0.01
    W2 = np.random.randn(n_output, n_hidden) * 0.01
    b2 = np.random.randn(n_output, 1) * 0.01
    return W1, b1, W2, b2

In [32]:
def softmax(Z):
    Z_shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_Z = np.exp(Z_shifted)
    return exp_Z/np.sum(exp_Z, axis=0, keepdims=True)

In [33]:
def forwardpass(W1, b1, W2, b2, X):
    #layer 1 computations
    n_samples = X.shape[1]
    b1_expanded = np.repeat(b1, n_samples, axis=1)
    Z1 = np.dot(W1, X) + b1_expanded
    A1 = np.maximum(0, Z1) #1st layer nodes for each sample
    b2_expanded = np.repeat(b2, n_samples, axis=1)
    Z2 = np.dot(W2, A1) + b2_expanded
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

In [34]:
def backprop(W1, b1, W2, b2, X, Y, Z1, A1, Z2, A2):
    # dZ2 = A2.copy()
    # for i in range(n_samples):
    #     dZ2[Y[i],i] -= 1
    # dW2 = (1/n_samples)*np.dot(dZ2, A1.T)
    n_samples = X.shape[1]
    dZ2 = A2.copy()
    dZ2[Y, np.arange(n_samples)] -= 1
    dZ2 = (1/n_samples)*dZ2
    dW2 = np.dot(dZ2, A1.T)

    db2 = np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.dot(W2.T, dZ2)
    ReLU_mask = (Z1 > 0).astype(int)
    dZ1 = dA1 * ReLU_mask
    dW1 = np.dot(dZ1, X.T)

    db1 = np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2

In [35]:
def update_params(W1, b1, W2, b2, Z1, A1, Z2, A2, X, Y, learning_rate=0.01):
    dW1, db1, dW2, db2 = backprop(W1, b1, W2, b2, X, Y, Z1, A1, Z2, A2)
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2 
    b2 = b2 - learning_rate*db2
    return W1, b1, W2, b2

In [36]:
def cost(A2, Y):
    n_samples = Y.shape[0]
    return -np.sum(np.log(A2[Y, np.arange(n_samples)])) / n_samples

In [37]:
def final_predictions(A2):
    return np.argmax(A2, axis=0)

In [38]:
def model_probabilities(W1, b1, W2, b2, X):
    _, _, _, A2 = forwardpass(W1, b1, W2, b2, X)
    return A2

In [39]:
def accuracy(A2, Y):
    final_preds = final_predictions(A2)
    return np.sum(final_preds == Y) / Y.size

In [40]:
def validation_and_decent(learning_rate=0.01, n_splits=5, n_iterations=500):
    average_cost = 0
    total_folds = 0
    
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold_idx,(train_idx, val_idx) in enumerate(kf.split(X_train_full.T, Y_train_full)):
        #train_idx and val_idx are an array of indecies to help split this data.
        X_train_split, Y_train_split = X_train_full[:,train_idx], Y_train_full[train_idx]
        X_val_split, Y_val_split = X_train_full[:,val_idx], Y_train_full[val_idx]
        
        
        n_features, n_samples = X_train_split.shape
        n_hidden = 10
        n_output = 10
        W1, b1, W2, b2 = init_params(n_hidden=n_hidden, n_features=n_features, n_output=n_output)

        for _ in range(n_iterations):
            Z1, A1, Z2, A2 = forwardpass(W1, b1, W2, b2, X_train_split)
            W1, b1, W2, b2 = update_params(W1, b1, W2, b2, Z1, A1, Z2, A2, X_train_split, Y_train_split, learning_rate)

        
        _, _, _, A2_val = forwardpass(W1, b1, W2, b2, X_val_split)
        fold_cost = cost(A2_val, Y_val_split)
        fold_accuracy = accuracy(A2_val, Y_val_split)
        print(f"Fold {fold_idx} — Cost: {fold_cost:.4f}, Acc: {fold_accuracy:.4f}")
        average_cost += fold_cost
        total_folds += 1
    average_cost /= total_folds
    print(f"\nHyperparameter: learning_rate = {learning_rate}")
    print(f"Average Cost: {average_cost:.4f}")
    return average_cost
    

In [None]:
#main validation loop
learning_rates = [1e-5, 1e-4, 1e-3, 5e-3, 1e-2]
lowest_cost     = float("inf")
best_lr         = None

for lr in learning_rates:
    avg_cost = validation_and_decent(learning_rate=lr, n_splits=5, n_iterations=500)
    print(f"LR={lr:.5f} → Avg Cost={avg_cost:.4f}")
    if avg_cost < lowest_cost:
        lowest_cost = avg_cost
        best_lr     = lr

print(f"\nBest learning rate: {best_lr}  (Avg Cost={lowest_cost:.4f})")


Fold 0 — Cost: 2.2471, Acc: 0.1561
Fold 1 — Cost: 2.1969, Acc: 0.1700


In [None]:
print("For Test commit")