In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [44]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data) # shuffle before splitting into dev and training sets

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape

In [60]:
def init_params():
    W1 = np.random.randn(10, 784) * np.sqrt(2 / 784)
    W2 = np.random.randn(10, 10) * np.sqrt(2 / 10)
    b1 = np.zeros((10, 1))
    b2 = np.zeros((10, 1))
    return W1, b1, W2, b2

def LReLu(Z, alpha=0.01):
    return np.maximum(alpha * Z, Z)

def softmax(Z):
    Z -= np.max(Z, axis=0, keepdims=True)  # Subtract the max value for numerical stability
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0, keepdims=True)
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = LReLu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def deriv_LReLu(Z, alpha=0.1):
    return np.where(Z > 0, 1, alpha)

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    m = X.shape[1]
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    dZ1 = W2.T.dot(dZ2) * deriv_LReLu(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1    
    W2 = W2 - alpha * dW2  
    b2 = b2 - alpha * db2    
    return W1, b1, W2, b2

In [59]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def adam_update(W, b, dW, db, mW, mb, vW, vb, t, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    t += 1
    # Update biased first moment estimate
    mW = beta1 * mW + (1 - beta1) * dW
    mb = beta1 * mb + (1 - beta1) * db
    
    # Update biased second moment estimate
    vW = beta2 * vW + (1 - beta2) * dW**2
    vb = beta2 * vb + (1 - beta2) * db**2
    
    # Compute bias-corrected first moment estimate
    mW_hat = mW / (1 - beta1**t)
    mb_hat = mb / (1 - beta1**t)
    
    # Compute bias-corrected second moment estimate
    vW_hat = vW / (1 - beta2**t)
    vb_hat = vb / (1 - beta2**t)
    
    # Update parameters
    W -= learning_rate * mW_hat / (np.sqrt(vW_hat) + epsilon)
    b -= learning_rate * mb_hat / (np.sqrt(vb_hat) + epsilon)
    
    return W, b, mW, mb, vW, vb, t

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    # using adam instead of constant alpha to improve calculations
    mW1, mb1, mW2, mb2 = np.zeros_like(W1), np.zeros_like(b1), np.zeros_like(W2), np.zeros_like(b2)
    vW1, vb1, vW2, vb2 = np.zeros_like(W1), np.zeros_like(b1), np.zeros_like(W2), np.zeros_like(b2)
    t = 0
    
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)

        W1, b1, mW1, mb1, vW1, vb1, t = adam_update(W1, b1, dW1, db1, mW1, mb1, vW1, vb1, t, alpha)
        W2, b2, mW2, mb2, vW2, vb2, t = adam_update(W2, b2, dW2, db2, mW2, mb2, vW2, vb2, t, alpha)
        
        if i % 10 == 0:
            predictions = get_predictions(A2)
            print(f"Iteration {i}, Accuracy: {get_accuracy(predictions, Y)}")
            
    return W1, b1, W2, b2

In [61]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.10, 500)

[4 7 7 ... 7 9 9] [2 1 0 ... 4 7 7]
Iteration 0, Accuracy: 0.07870731707317073
[2 1 6 ... 9 7 7] [2 1 0 ... 4 7 7]
Iteration 10, Accuracy: 0.5822926829268292
[2 1 8 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 20, Accuracy: 0.7373414634146341
[2 1 8 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 30, Accuracy: 0.8120975609756098
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 40, Accuracy: 0.8206097560975609
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 50, Accuracy: 0.8512682926829268
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 60, Accuracy: 0.8628536585365854
[2 1 0 ... 9 7 7] [2 1 0 ... 4 7 7]
Iteration 70, Accuracy: 0.8798292682926829
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 80, Accuracy: 0.8897804878048781
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 90, Accuracy: 0.8959024390243903
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 100, Accuracy: 0.9012682926829269
[2 1 0 ... 4 7 7] [2 1 0 ... 4 7 7]
Iteration 110, Accuracy: 0.7551707317073171
[2 1 0 ... 9 7 7] [2 1 0 ... 4 7 7]
Iteration 120,

In [62]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions
dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)
get_accuracy(dev_predictions, Y_dev)

[3 5 6 0 2 2 6 3 6 3 2 4 4 6 7 5 4 7 5 7 3 3 4 7 8 5 2 9 5 8 1 7 0 6 6 1 3
 8 7 9 7 1 4 2 0 9 9 8 1 6 8 1 3 3 7 7 0 5 5 1 1 6 8 1 1 8 1 2 3 8 4 0 2 9
 1 1 0 7 0 6 9 4 0 3 5 9 1 8 4 2 1 4 2 5 0 9 6 5 3 8 6 6 1 4 0 6 9 2 0 9 0
 8 0 0 4 6 0 3 3 1 4 9 9 3 2 3 0 4 1 2 8 5 2 4 2 6 9 0 9 8 3 4 5 7 3 4 1 1
 7 7 0 0 7 9 2 8 1 5 1 8 3 1 3 2 8 0 6 7 7 4 2 6 5 1 5 7 8 3 7 3 2 9 5 2 3
 6 2 2 1 2 3 0 6 4 3 4 5 3 9 0 4 7 5 9 0 4 1 1 2 5 8 1 0 7 4 2 6 9 5 7 1 3
 8 9 6 0 0 4 2 4 7 1 5 9 5 0 3 8 7 1 9 7 2 2 9 2 7 9 0 2 4 6 5 4 3 0 5 5 5
 7 8 3 1 4 2 4 6 9 3 7 1 1 8 7 6 7 7 6 5 8 9 4 9 6 5 0 1 5 1 6 1 4 5 8 7 7
 4 4 4 3 3 0 1 8 0 1 9 6 1 1 5 3 6 2 5 4 2 8 9 5 2 1 0 3 3 0 3 6 1 9 6 8 3
 9 9 8 4 7 0 2 2 8 2 0 2 8 5 9 1 9 0 7 0 2 6 1 2 4 7 2 7 3 2 1 1 1 5 9 1 3
 3 6 9 4 6 9 9 8 2 4 2 3 3 7 2 3 3 9 1 6 1 2 1 1 0 6 2 1 3 0 7 1 6 7 9 3 2
 0 7 6 5 4 0 1 3 2 9 2 2 1 5 9 0 9 8 3 6 1 1 4 1 1 9 7 9 4 8 0 9 0 8 0 4 8
 7 2 7 1 1 3 1 2 7 5 9 7 8 0 6 2 4 8 3 0 2 5 5 4 9 8 9 1 9 0 8 8 9 1 3 5 0
 2 0 9 2 1 4 8 1 5 8 0 0 

0.932