# 0-Import Libraries

In [402]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [403]:
# Load the Kaggle "digit-recognizer" training CSV into a pandas DataFrame.
data = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")

In [404]:
# Preview the first rows of the loaded DataFrame (shown as the cell's last expression).
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 1-Converting to an array/shape it/shuffle data for cross validation

In [405]:
# Convert the DataFrame to a NumPy array.
# Note: this rebinds `data`, so the DataFrame is no longer reachable under that name.
data = np.array(data)
print(data)  # one example per row; per the CSV header the first column is the label

[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [7 0 0 ... 0 0 0]
 [6 0 0 ... 0 0 0]
 [9 0 0 ... 0 0 0]]


In [406]:
# Get the array's shape: m rows (examples) and n columns
m, n=data.shape
print(m,n) # 42000 examples (m: rows), 785 columns each (n: 1 label + 784 pixel values)

42000 785


In [407]:
# Shuffle the rows (one example per row) in place so the examples are not
# left in their original file order.
# The global NumPy RNG is seeded first: without a seed, this shuffle and every
# later np.random call (when the cells are run in order) give different
# results on each "Restart Kernel -> Run All", so outputs cannot be reproduced.
np.random.seed(42)
np.random.shuffle(data)
print(data)

[[9 0 0 ... 0 0 0]
 [6 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]]


# 2-Setting Training Data

In [408]:
# Build the training matrices from a shuffled copy of `data`.
train_data = np.array(data)            # copy, so the shuffle below leaves `data` untouched
np.random.shuffle(train_data)          # reorder the examples (rows) in place
train_data = np.transpose(train_data)  # now one example per column
Y_train = train_data[0, :]             # first row: the digit labels
X_train = train_data[1:, :] / 255      # remaining rows: pixel values, scaled by 1/255

In [409]:
# Print the label vector, then display its shape as the cell's result.
print(Y_train)
Y_train.shape

[2 5 3 ... 5 1 3]


(42000,)

In [410]:
# Features only: of the 785 rows of the transposed data, the label row went to Y_train,
# leaving the 784 pixel rows here.
print(X_train)
X_train.shape

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


(784, 42000)

# 3-Initialization of Parameters

In [411]:
def init_parameters(n_inputs=784, n_hidden=10, n_outputs=10):
    """Create randomly initialised weights and biases for the two-layer network.

    Every entry is drawn with ``np.random.rand`` (uniform on [0, 1)) and then
    shifted by -0.5, so the values are centred on zero in [-0.5, 0.5).

    The layer sizes used to be hard-coded; they are now parameters whose
    defaults reproduce the previous shapes, so ``init_parameters()`` is
    unchanged for existing callers.

    Args:
        n_inputs: Number of input features per example (default 784).
        n_hidden: Number of units in the hidden layer (default 10).
        n_outputs: Number of output units / classes (default 10).

    Returns:
        Tuple ``(W1, B1, W2, B2)`` with shapes ``(n_hidden, n_inputs)``,
        ``(n_hidden, 1)``, ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
    """
    # The draw order (W1, B1, W2, B2) is kept so that a seeded RNG yields the
    # same parameters as before for the default sizes.
    W1 = np.random.rand(n_hidden, n_inputs) - 0.5
    B1 = np.random.rand(n_hidden, 1) - 0.5
    W2 = np.random.rand(n_outputs, n_hidden) - 0.5
    B2 = np.random.rand(n_outputs, 1) - 0.5
    return W1, B1, W2, B2

In [412]:
def ReLu(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``.

    Negative entries become 0; all other entries pass through unchanged.
    """
    rectified = np.maximum(0, Z)
    return rectified

In [413]:
def softmax(Z):
    """Column-wise softmax of ``Z``.

    The maximum of each column is subtracted before exponentiating. This
    leaves the result mathematically unchanged but keeps ``np.exp`` from
    overflowing on large inputs.

    Returns an array of the same shape as ``Z`` whose columns each sum to 1.
    """
    column_max = np.max(Z, axis=0, keepdims=True)
    shifted_exp = np.exp(Z - column_max)
    column_total = np.sum(shifted_exp, axis=0, keepdims=True)
    return shifted_exp / column_total

In [414]:
def one_hot(Y, num_classes=None):
    """One-hot encode a 1-D array of integer labels, one column per example.

    Args:
        Y: 1-D NumPy array of non-negative integer class labels.
        num_classes: Number of rows (classes) in the result. When ``None``
            (the default, matching the previous behaviour) it is inferred as
            ``Y.max() + 1``. Pass it explicitly whenever ``Y`` might not
            contain the highest class (e.g. a small batch), otherwise the
            result is too short to line up with the network output.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` where entry
        ``[Y[j], j]`` is 1 and every other entry is 0.

    Raises:
        IndexError: If a label is ``>= num_classes``.
    """
    if num_classes is None:
        num_classes = Y.max() + 1
    # Built as (examples, classes) and then transposed, so each column is one example.
    one_hot_Y = np.zeros((Y.size, num_classes))
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

# 4-Propagation (Forward and Backward)

In [415]:
#Directly Implementation From notebook

#Forward Propagation
def Forward_Propagation(W1, B1, W2, B2, X):
    """Run one forward pass through the two-layer network.

    Hidden layer: ``W1 . X + B1`` followed by ``ReLu``.
    Output layer: ``W2 . A1 + B2`` followed by ``softmax``.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: the pre-activation and activation of the
        hidden layer, then the pre-activation and activation of the output
        layer.
    """
    hidden_pre = np.dot(W1, X) + B1
    hidden_act = ReLu(hidden_pre)
    output_pre = np.dot(W2, hidden_act) + B2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act


#BackPropagation
def Relu_derivative(Z):
    """Slope of ReLu at each entry of ``Z``, as a boolean mask.

    True where ``Z`` is strictly positive (slope 1), False elsewhere
    (slope 0). Multiplying by the mask treats True/False as 1/0.
    """
    is_positive = Z > 0
    return is_positive

def Back_Propagation(Z1, A1, Z2, A2, W2, X, Y):
    """Compute the parameter gradients for one pass over ``X``.

    Args:
        Z1, A1: Hidden-layer pre-activation and activation from the forward pass.
        Z2: Output-layer pre-activation (accepted but not used here).
        A2: Output-layer activation from the forward pass.
        W2: Output-layer weights, used to push the error back to the hidden layer.
        X: Input matrix that was fed to the forward pass.
        Y: Integer labels; one-hot encoded internally via ``one_hot``.

    Returns:
        Tuple ``(dW1, dB1, dW2, dB2)``, each averaged over the ``Y.size`` examples.
    """
    scale = 1 / Y.size
    targets = one_hot(Y)

    # Output layer: the error is the activation minus the one-hot targets.
    output_err = A2 - targets
    grad_W2 = scale * output_err.dot(A1.T)
    grad_B2 = scale * np.sum(output_err, axis=1, keepdims=True)

    # Hidden layer: push the error back through W2 and gate it by the ReLu slope.
    hidden_err = W2.T.dot(output_err) * Relu_derivative(Z1)
    grad_W1 = scale * hidden_err.dot(X.T)
    grad_B1 = scale * np.sum(hidden_err, axis=1, keepdims=True)

    return grad_W1, grad_B1, grad_W2, grad_B2

# 5-Gradient Descent and Accuracy Predictor

In [416]:
#Gradient Descent 
def update_Parameters(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter is moved against its gradient, scaled by the learning
    rate ``alpha``. The inputs are not modified; new arrays are returned.

    Returns:
        Tuple ``(W1, B1, W2, B2)`` of updated parameters.
    """
    return (
        W1 - alpha * dW1,
        B1 - alpha * dB1,
        W2 - alpha * dW2,
        B2 - alpha * dB2,
    )

def get_predictions(A2):
    """Return, for each column of ``A2``, the row index holding the largest value."""
    predicted_rows = np.argmax(A2, axis=0)
    return predicted_rows

def get_accuaracy(predictions,Y):
    """Print both arrays, then return the fraction of predictions equal to ``Y``.

    The result is the number of positions where ``predictions == Y`` divided
    by ``Y.size``.
    """
    print(predictions, Y)
    n_correct = np.sum(predictions == Y)
    return n_correct / Y.size


def make_predictions(X, W1, b1, W2, b2):
    """Predict a class index for every column (example) of ``X``.

    Runs a forward pass with the given parameters and returns the arg-max of
    the output activations via ``get_predictions``.

    Args:
        X: Input matrix with one example per column.
        W1, b1: Hidden-layer weights and biases.
        W2, b2: Output-layer weights and biases.

    Returns:
        The array produced by ``get_predictions`` for the output activations.
    """
    # Fix: this previously called `forward_prop`, a name that does not exist in
    # this notebook (NameError); the forward pass is `Forward_Propagation`.
    # Only the output activations are needed here.
    _, _, _, A2 = Forward_Propagation(W1, b1, W2, b2, X)
    return get_predictions(A2)


def Gradient_descent(X, Y, iterations, alpha):
    """Train the two-layer network on all of ``X`` for a fixed number of steps.

    Starts from ``init_parameters()`` and on every iteration runs a forward
    pass, a backward pass and a parameter update with learning rate
    ``alpha``. Whenever the iteration index is a multiple of 101, the
    accuracy on ``(X, Y)`` is printed; it is computed from the activations
    obtained before that iteration's update.

    Returns:
        Tuple ``(W1, B1, W2, B2)`` of trained parameters.
    """
    W1, B1, W2, B2 = init_parameters()
    for i in range(iterations):
        Z1, A1, Z2, A2 = Forward_Propagation(W1, B1, W2, B2, X)
        gradients = Back_Propagation(Z1, A1, Z2, A2, W2, X, Y)
        dW1, dB1, dW2, dB2 = gradients
        W1, B1, W2, B2 = update_Parameters(W1, B1, W2, B2, dW1, dB1, dW2, dB2, alpha)

        # Only report progress on every 101st iteration (0, 101, 202, ...).
        if i % 101 != 0:
            continue
        accuracy = get_accuaracy(get_predictions(A2), Y)
        print(f'Iteration: {i}')
        print(f'Accuracy: {accuracy:.4f}')
    return W1, B1, W2, B2

In [417]:
# Train on the full training set: 1011 iterations with learning rate 0.15.
W1, b1, W2, b2 = Gradient_descent(X_train, Y_train, iterations=1011, alpha=0.15)

[1 1 8 ... 6 8 1] [2 5 3 ... 5 1 3]
Iteration: 0
Accuracy: 0.1012
[2 5 5 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 101
Accuracy: 0.7628
[2 5 5 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 202
Accuracy: 0.8298
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 303
Accuracy: 0.8542
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 404
Accuracy: 0.8677
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 505
Accuracy: 0.8771
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 606
Accuracy: 0.8835
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 707
Accuracy: 0.8880
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 808
Accuracy: 0.8916
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 909
Accuracy: 0.8949
[2 5 3 ... 5 1 3] [2 5 3 ... 5 1 3]
Iteration: 1010
Accuracy: 0.8979
