In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)
print(f"m = {m}, n = {n}")

m = 42000, n = 785


In [3]:
data_train = data[8400:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_, m_train = X_train.shape

In [4]:
Y_train

array([4, 3, 8, ..., 1, 1, 7])

In [5]:
def init_params():
    W1 = np.random.uniform(-0.3, 0.3, (32, 784))
    b1 = np.random.uniform(-0.3, 0.3, (32, 1))
    
    W2 = np.random.uniform(-0.3, 0.3, (32, 32))
    b2 = np.random.uniform(-0.3, 0.3, (32, 1))
                        
    W3 = np.random.uniform(-0.3, 0.3, (64, 32))
    b3 = np.random.uniform(-0.3, 0.3, (64, 1))

    W4 = np.random.uniform(-0.3, 0.3, (64, 64))
    b4 = np.random.uniform(-0.3, 0.3, (64, 1))

    W5 = np.random.uniform(-0.3, 0.3, (10, 64))
    b5 = np.random.uniform(-0.3, 0.3, (10, 1))

    return W1, b1, W2, b2, W3, b3, W4, b4, W5, b5

def ReLU(Z):
    return np.maximum(0, Z)

def sigmoid(Z):
    return 1/ (1 + np.exp(-Z))

def SeLU(Z, alpha=1.67326324, scale=1.05070098):
    return np.where(Z > 0, scale * Z, scale * alpha * (np.exp(Z) - 1))

def tanh(Z):
    return np.tanh(Z)

def softmax(Z):
    return np.exp(Z) / sum(np.exp(Z))

def ReLU_deriv(Z):
    return Z > 0

def sigmoid_deriv(Z):
    return sigmoid(Z) * (1 - sigmoid(Z))

def SeLU_deriv(Z, alpha=1.67326324, scale=1.05070098):
    return np.where(Z > 0, scale, scale * alpha * np.exp(Z))
    
def tanh_deriv(Z):
    return 1 - (tanh(Z) ** 2)

def one_hot(Y): #Make everything 0's except the most "valued" value!
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    
    return one_hot_Y

def forward_prop(X, W1, b1, W2, b2, W3, b3, W4, b4, W5, b5):
    Z1 = W1.dot(X) + b1 #(32, 784) * (784, m) + (32,1)
    A1 = ReLU(Z1) #(32, m) 
    
    Z2 = W2.dot(A1) + b2 #(32, 32) @ (32, m) + (32, 1)
    A2 = sigmoid(Z2) #(32, m)
    
    Z3 = W3.dot(A2) + b3 #(64, 32) @ (32, m) + (64, 1)
    A3 = SeLU(Z3) #(64, m)
    
    Z4 = W4.dot(A3) + b4 #(64, 64) @ (64, m) + (64, 1)
    A4 = tanh(Z4) #(64, m)
    
    Z5 = W5.dot(A4) + b5 #(10, 64) @ (64, m) + (10, 1)
    A5 = softmax(Z5) #(10, m)
    
    return Z1, A1, Z2, A2, Z3, A3, Z4, A4, Z5, A5

def back_prop(X, Y, W1, W2, W3, W4, W5, Z1, Z2, Z3, Z4, Z5, A1, A2, A3, A4, A5):
    one_hot_Y = one_hot(Y)
    
    dZ5 = A5 - one_hot_Y
    dW5 = 1 / m * dZ5.dot(A4.T)
    db5 = 1 / m * np.sum(dZ5, axis=1, keepdims=True)
    
    dZ4 = W5.T.dot(dZ5) * tanh_deriv(Z4)
    dW4 = 1 / m * dZ4.dot(A3.T)
    db4 = 1 / m * np.sum(dZ4, axis=1, keepdims=True)

    dZ3 = W4.T.dot(dZ4) * SeLU_deriv(Z3)
    dW3 = 1 / m * dZ3.dot(A2.T)
    db3 = 1 / m * np.sum(dZ3, axis=1, keepdims=True)

    dZ2 = W3.T.dot(dZ3) * sigmoid_deriv(Z2)
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2, axis=1, keepdims=True)
    
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5

def update_params(W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    
    W3 = W3 - alpha * dW3
    b3 = b3 - alpha * db3
    
    W4 = W4 - alpha * dW4
    b4 = b4 - alpha * db4

    W5 = W5 - alpha * dW5
    b5 = b5 - alpha * db5

    return W1, b1, W2, b2, W3, b3, W4, b4, W5, b5



In [6]:
def get_prediction_A2(A2):
    return np.argmax(A2, 0)
def get_prediction_A3(A3):
    return np.argmax(A3, 0)
def get_prediction_A4(A4):
    return np.argmax(A4, 0)
def get_prediction_A5(A5):
    return np.argmax(A5, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2, W3, b3, W4, b4, W5, b5 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3, Z4, A4, Z5, A5 = forward_prop(X, W1, b1, W2, b2, W3, b3, W4, b4, W5, b5)
        dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5 = back_prop(X, Y, W1, W2, W3, W4, W5, Z1, Z2, Z3, Z4, Z5, A1, A2, A3, A4, A5)
        W1, b1, W2, b2, W3, b3, W4, b4, W5, b5 = update_params(W1, b1, W2, b2, W3, b3, W4, b4, W5, b5, dW1, db1, dW2, db2, dW3, db3, dW4, db4, dW5, db5, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions_A2 = get_prediction_A2(A2)
            predictions_A3 = get_prediction_A3(A3)
            predictions_A4 = get_prediction_A4(A4)
            predictions_A5 = get_prediction_A5(A5)
            print(get_accuracy(predictions_A2, Y))
            print(get_accuracy(predictions_A3, Y))
            print(get_accuracy(predictions_A4, Y))
            print(get_accuracy(predictions_A5, Y))
    return W1, b1, W2, b2, W3, b3, W4, b4, W5, b5

In [7]:
W1, b1, W2, b2, W3, b3, W4, b4, W5, b5 = gradient_descent(X_train, Y_train, 0.10, 500)

Iteration:  0
[14 16 16 ...  8 16  8] [4 3 8 ... 1 1 7]
0.025863095238095237
[24 24 56 ... 24 56 24] [4 3 8 ... 1 1 7]
0.0
[31 31 11 ... 31 31 10] [4 3 8 ... 1 1 7]
8.928571428571429e-05
[6 5 5 ... 5 5 5] [4 3 8 ... 1 1 7]
0.08821428571428572
Iteration:  10
[14 16 16 ...  8 16  8] [4 3 8 ... 1 1 7]
0.035863095238095236
[24 24 56 ... 24 56 24] [4 3 8 ... 1 1 7]
0.0
[31 31 11 ... 31 31 10] [4 3 8 ... 1 1 7]
0.0002380952380952381
[6 0 1 ... 9 1 7] [4 3 8 ... 1 1 7]
0.22654761904761905
Iteration:  20
[14 16 16 ...  8 16  8] [4 3 8 ... 1 1 7]
0.03663690476190476
[24 24 24 ... 24 56 24] [4 3 8 ... 1 1 7]
0.0
[10 31 10 ... 31 31 10] [4 3 8 ... 1 1 7]
0.0009821428571428572
[6 0 3 ... 1 1 7] [4 3 8 ... 1 1 7]
0.3234821428571429
Iteration:  30
[14 16 16 ...  8 16  5] [4 3 8 ... 1 1 7]
0.03619047619047619
[24 24 24 ... 24 24 24] [4 3 8 ... 1 1 7]
0.0
[10 31 10 ... 31 31 10] [4 3 8 ... 1 1 7]
0.0034226190476190476
[6 0 0 ... 1 1 7] [4 3 8 ... 1 1 7]
0.3961904761904762
Iteration:  40
[14 16 16 ... 