In [None]:
from pathlib import Path
import zipfile

In [None]:
# Resolve the dataset locations relative to the notebook's working directory.
data_path = Path("Data/digit-recognizer")
image_path = data_path / "digit_recognizer"
archive_path = data_path / "digit-recognizer.zip"

# Make sure the extraction target directory exists.
if image_path.is_dir():
    print(f"{image_path} directory already exists .... skipp creating one")
else:
    print(f"{image_path} does not exist, creating one...")
    image_path.mkdir(parents=True, exist_ok=True)

# Only unpack when the training CSV is missing, so that re-running this cell
# is cheap and does not require the archive to still be on disk.
if (image_path / "train.csv").is_file():
    print(f"{image_path / 'train.csv'} already extracted, skipping unzip")
else:
    with zipfile.ZipFile(archive_path, "r") as f:
        print("Unzipping the file...")
        f.extractall(image_path)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the extracted training CSV into a DataFrame.
train_csv_path = "Data/digit-recognizer/digit_recognizer/train.csv"
data = pd.read_csv(train_csv_path)

In [None]:
# Display the shape of the loaded DataFrame as (rows, columns).
data.shape

# Splitting the data into validation and training data 


In [None]:
# Convert the DataFrame into a NumPy array, shuffle the rows reproducibly and
# split them into a validation set (first VAL_SIZE rows) and a training set.
VAL_SIZE = 1000              # number of shuffled rows held out for validation
SEED = 42                    # fixed seed so the shuffle (and thus the split) is reproducible

data = np.array(data)
m, n = data.shape            # m = number of rows (examples), n = number of columns (label + pixels)

# Shuffle the rows in place before splitting. Seeding the global NumPy RNG here
# also makes later np.random draws (e.g. weight initialisation) repeatable when
# the notebook is run top to bottom.
np.random.seed(SEED)
np.random.shuffle(data)

# Transpose so that every column is one example; the forward pass computes W.dot(X).
data_val = data[0:VAL_SIZE].T
Y_val = data_val[0]          # after the transpose the labels are the zeroth row
X_val = data_val[1:n]
# Each pixel has a single pixel-value associated with it, indicating the
# lightness or darkness of that pixel, with higher numbers meaning darker.
# This pixel-value is an integer between 0 and 255, inclusive, so dividing by
# 255 scales every feature into [0, 1].
X_val = X_val / 255

data_train = data[VAL_SIZE:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255


In [None]:
# Display the shape of the first column of X_train (the feature vector of one training example).
X_train[:, 0].shape

The NN will have a 3-layer architecture. The input layer A[0] will have 784 units, corresponding to the 784 pixels in each 28*28 input image.
The two hidden layers will have 100 units each, with ReLU activation for non-linearity and to increase the capacity of the network.
The output layer will have 10 units, corresponding to the ten digit classes, with Softmax activation.

# Forward Activation:
      Z[1] = W[1]X + b[1]
      A[1] = gReLU(Z[1])
      Z[2] = W[2]A[1] + b[2]
      A[2] = gReLU(Z[2])
      Z[3] = W[3]A[2] + b[3]
      A[3] = gsoftmax(Z[3])
        
        
# Backward Propagation:
      dZ[3] = A[3] - Y                    # gradient of the cross-entropy loss w.r.t. Z[3] (softmax output minus the one-hot labels Y)
      dW[3] = 1/m (dZ[3] A[2].T)
      dB[3] = 1/m sum(dZ[3])              # summed over the m examples, one value per unit
      dZ[2] = W[3].T dZ[3] * derivative_relu(Z[2])
      dW[2] = 1/m (dZ[2] A[1].T)
      dB[2] = 1/m sum(dZ[2])
      dZ[1] = W[2].T dZ[2] * derivative_relu(Z[1])
      dW[1] = 1/m (dZ[1] A[0].T)
      dB[1] = 1/m sum(dZ[1])
                      
# Parameter Update:
      W[3] = W[3] - alpha(dW[3])
      b[3] = b[3] - alpha(dB[3])
      W[2] = W[2] - alpha(dW[2])
      b[2] = b[2] - alpha(dB[2])
      W[1] = W[1] - alpha(dW[1])
      b[1] = b[1] - alpha(dB[1])
                      
# Variables and shapes :
                      A[0] = X: 784*m
                      Z[1] ~ A[1] = 100*m
                      W[1] : 100*784 (as W[1].A[0] = Z[1])
                      B[1] = 100*1
                      Z[2] ~ A[2] = 100*m
                      W[2] : 100*100 (as W[2].A[1] = Z[2])
                      B[2] = 100*1
                      Z[3] ~ A[3] = 10*m
                      W[3] : 10*100 (as W[3].A[2] = Z[3])
                      B[3] = 10*1
                      
    * Backprop:
                      dZ[3] ~ A[3] : 10*m
                      dW[3] = 10*100
                      dB[3] = 10*1
                      dZ[2] ~ A[2] : 100*m
                      dW[2] = 100*100
                      dB[2] = 100*1
                      dZ[1] ~ A[1] : 100*m
                      dW[1] = 100*784
                      dB[1] = 100*1
                      

In [None]:
# Defining the functions for every process

def init_params(n_input=784, n_hidden1=100, n_hidden2=100, n_output=10):
    """Create randomly initialised weights and biases for the 3-layer network.

    Every entry is drawn from the global NumPy RNG, uniformly in [-0.5, 0.5).
    The defaults reproduce the original fixed architecture
    (784 -> 100 -> 100 -> 10).

    Args:
        n_input: Number of input features (rows of X).
        n_hidden1: Number of units in the first hidden layer.
        n_hidden2: Number of units in the second hidden layer.
        n_output: Number of output units (classes).

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` where each ``W`` has shape
        ``(units_out, units_in)`` and each ``b`` has shape ``(units_out, 1)``.
    """
    # The draw order matches the original so a seeded run yields the same values.
    W1 = np.random.rand(n_hidden1, n_input) - 0.5
    b1 = np.random.rand(n_hidden1, 1) - 0.5
    W2 = np.random.rand(n_hidden2, n_hidden1) - 0.5
    b2 = np.random.rand(n_hidden2, 1) - 0.5
    W3 = np.random.rand(n_output, n_hidden2) - 0.5
    b3 = np.random.rand(n_output, 1) - 0.5
    return W1, b1, W2, b2, W3, b3

def ReLU(Z):
    """Rectified linear unit: element-wise maximum of Z and zero."""
    floor = 0
    return np.maximum(Z, floor)

def softmax(Z):
    """Column-wise softmax of the logits ``Z``.

    Each column of ``Z`` (one example) is turned into a probability
    distribution over the rows (classes).

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (which would produce NaN probabilities) for large logits.

    Args:
        Z: Array of logits; normalisation runs along axis 0.

    Returns:
        Float array with the same shape as ``Z`` whose entries along axis 0
        sum to 1.
    """
    Z = np.asarray(Z, dtype=float)
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run one forward pass through the three layers.

    Layers 1 and 2 apply an affine map followed by ReLU; layer 3 applies an
    affine map followed by softmax.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)`` holding the pre-activation and
        activation of every layer, in order.
    """
    # First hidden layer.
    Z1 = np.dot(W1, X) + b1
    A1 = ReLU(Z1)

    # Second hidden layer.
    Z2 = np.dot(W2, A1) + b2
    A2 = ReLU(Z2)

    # Output layer.
    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)

    return Z1, A1, Z2, A2, Z3, A3

def ReLU_deriv(Z):
    """Derivative of ReLU, returned as a boolean mask.

    ReLU has slope 1 where its input is positive and slope 0 elsewhere, so the
    mask is True exactly where ``Z > 0``; it acts as 1/0 when multiplied with
    a numeric array.
    """
    positive_mask = Z > 0
    return positive_mask

def one_hot(Y, num_classes=None):
    """One-hot encode a 1-D array of integer class labels.

    Args:
        Y: 1-D array-like of non-negative integer labels, one per example.
        num_classes: Number of rows (classes) in the output. When ``None``
            (the default, matching the original behaviour) it is inferred as
            ``Y.max() + 1``. Pass it explicitly when ``Y`` may not contain the
            highest class, otherwise the result has too few rows.

    Returns:
        Float array of shape ``(num_classes, Y.size)``; column ``i`` is all
        zeros except for a 1 in row ``Y[i]``.
    """
    Y = np.asarray(Y)
    if num_classes is None:
        num_classes = Y.max() + 1
    one_hot_Y = np.zeros((num_classes, Y.size))
    # Row index = class label, column index = example index.
    one_hot_Y[Y, np.arange(Y.size)] = 1
    return one_hot_Y


def back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    """Compute the gradients of the loss w.r.t. every weight and bias.

    Args:
        Z1, A1, Z2, A2, Z3, A3: Pre-activations and activations returned by
            the forward pass for the batch ``X``.
        W1, W2, W3: Current weight matrices (``W1`` is accepted for signature
            symmetry but is not needed for the gradients).
        X: Input batch, one example per column.
        Y: 1-D array of integer labels, one per column of ``X``.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)``. Each ``dW`` has the shape of
        the matching ``W``; each ``db`` has shape ``(units, 1)`` with one
        gradient per unit.
    """
    # Average over the examples actually in this batch rather than relying on
    # a notebook-level global row count.
    n_samples = Y.size

    # One-hot the labels so they can be compared with the softmax output.
    # NOTE: one_hot infers the class count from Y.max(), so Y must contain the
    # highest class for the shapes to line up with A3.
    one_hot_Y = one_hot(Y)

    # An L2 loss (1/m * sum((y_true - y_pred)**2), derivative
    # 2/m * (y_true - y_pred)) was tried here and did not work well, as L2 is
    # mainly a regression loss. For classification we use the cross-entropy
    # loss -1/m * sum(sum(y_true * log(y_pred))); combined with softmax its
    # gradient w.r.t. Z3 is simply y_pred - y_true.
    dZ3 = A3 - one_hot_Y
    dW3 = dZ3.dot(A2.T) / n_samples
    # Sum over the example axis only, giving one bias gradient per unit.
    db3 = np.sum(dZ3, axis=1, keepdims=True) / n_samples

    dZ2 = W3.T.dot(dZ3) * ReLU_deriv(Z2)
    dW2 = dZ2.dot(A1.T) / n_samples
    db2 = np.sum(dZ2, axis=1, keepdims=True) / n_samples

    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1)
    dW1 = dZ1.dot(X.T) / n_samples
    db1 = np.sum(dZ1, axis=1, keepdims=True) / n_samples

    return dW1, db1, dW2, db2, dW3, db3

def update_params(dW1, db1, dW2, db2, dW3, db3, W1, b1, W2, b2, W3, b3, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter ``p`` with gradient ``g`` becomes ``p - alpha * g``.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of the updated parameters.
    """
    params = (W1, b1, W2, b2, W3, b3)
    grads = (dW1, db1, dW2, db2, dW3, db3)
    return tuple(param - alpha * grad for param, grad in zip(params, grads))

    
    
    

    

In [None]:
# Functions for Getting Predictions and Accuracy  
def get_predictions(A3):
    """Return, for every column of A3, the row index holding the largest value."""
    predicted_classes = np.argmax(A3, axis=0)
    return predicted_classes

def get_accuracy(predictions, Y, verbose=False):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predictions: Array-like of predicted class labels.
        Y: Array-like of true class labels with the same shape.
        verbose: When True, print both arrays before computing the score
            (the original unconditional debug output). Off by default so
            training logs are not flooded with full arrays.

    Returns:
        Accuracy in [0, 1] as a NumPy float.

    Raises:
        ValueError: If ``predictions`` and ``Y`` have different shapes, which
            would otherwise broadcast silently and give a meaningless score.
    """
    predictions = np.asarray(predictions)
    Y = np.asarray(Y)
    if predictions.shape != Y.shape:
        raise ValueError(
            f"shape mismatch: predictions {predictions.shape} vs labels {Y.shape}"
        )
    if verbose:
        print(predictions, Y)
    return np.sum(predictions == Y) / Y.size


                                

In [None]:
# Defining Gradient Descent for Backpropagation

def gradient_descent(X, Y, iterations, alpha):
    """Train the network with full-batch gradient descent.

    Args:
        X: Input batch, one example per column.
        Y: 1-D array of integer labels for the columns of ``X``.
        iterations: Number of gradient-descent steps to run.
        alpha: Learning rate.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of the trained parameters.
    """
    params = init_params()
    for step in range(iterations):
        W1, _, W2, _, W3, _ = params
        cache = forward_prop(*params, X)                 # (Z1, A1, Z2, A2, Z3, A3)
        grads = back_prop(*cache, W1, W2, W3, X, Y)      # (dW1, db1, dW2, db2, dW3, db3)
        params = update_params(*grads, *params, alpha)

        # Report progress every 10th step, using the activations computed
        # before this step's update.
        if step % 10 == 0:
            print(f"Iteration : {step}")
            predictions = get_predictions(cache[-1])
            print(get_accuracy(predictions, Y))
    return params
        
        

In [None]:
# Train on the training split (200 iterations, learning rate 0.10) and unpack the learned parameters.
W1, b1, W2, b2, W3, b3 = gradient_descent(
    X_train, Y_train, iterations=200, alpha=0.10
)

In [None]:
# Functions for inference and testing 
def make_predictions(X, W1, b1, W2, b2, W3, b3):
    """Predict a class label for every column of X with the given parameters."""
    # Only the final activation (softmax output) is needed for inference.
    *_, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
    return get_predictions(A3)

def test_predictions(index, W1, b1, W2, b2, W3, b3):
    """Print the prediction and true label for one training example and show its image.

    Reads the example from the notebook-level ``X_train`` / ``Y_train`` arrays.
    """
    # Index with None to keep the sample as a single-column 2-D array.
    sample = X_train[:, index, None]
    label = Y_train[index]
    prediction = make_predictions(sample, W1, b1, W2, b2, W3, b3)
    print("Prediction: ", prediction)
    print("Label:", label)

    # Undo the /255 scaling and lay the 784 pixels out as a 28x28 image.
    pixels = sample.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(pixels, interpolation="nearest")
    plt.show()
               


In [None]:
# Spot-check a handful of training examples.
for sample_index in (1, 145, 1000, 21000):
    test_predictions(sample_index, W1, b1, W2, b2, W3, b3)

In [None]:
# Find the accuracy on the held-out validation set (X_val / Y_val) using the trained parameters.

val_predictions = make_predictions(X_val, W1, b1, W2, b2, W3, b3)
get_accuracy(val_predictions, Y_val)