### In this notebook I implement a two-layer neural network from "scratch" using pandas and numpy and train it on the MNIST digit recognizer data set. 
This notebook aims to give a basic understanding of the underlying mathematics behind these amazing technologies. The main components are:

Forward propagation (Input --> output)
Back propagation (Loss function, error minimization, gradient descent, partial derivatives with respect to weights and biases) 




In [190]:
# Libraries Imports

import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt

## Importing Data 

In [191]:
# Load the data set from 'train.csv' (path is relative to the working directory) into a DataFrame
data = pd.read_csv('train.csv')

In [192]:
# Preview the first five rows
data.head(5)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
# (number of rows, number of columns) of the DataFrame
data.shape

(42000, 785)

In [194]:
# Visualizing the pixels and image

## Data Preprocessing 

In [195]:
# Convert the DataFrame into a numpy array so that all the matrix operations below can be performed on it
data = np.array(data)

In [196]:
# Get the dimensions: m = number of rows, n = number of columns
m,n = data.shape

In [197]:
# m is the number of rows, i.e. the number of images in this dataset
m

42000

In [198]:
# n is the number of columns: 784 pixel features (28x28) + 1 label column
n

785

In [199]:
# Split the data into testing data and training data

# Test data: the first 1000 rows
data_test = data[0:1000]

In [200]:
# Transpose the data so that each column is one image and each row is one field (label or pixel) of that image
data_test = data_test.T
data_test.shape

(785, 1000)

In [201]:
#Initiating x and y test data 

In [202]:
# Y_test: the first row of the transposed test data (the label column of the original table)
Y_test = data_test[0]
Y_test.shape

(1000,)

In [203]:
# Inspect the test labels
Y_test

array([1, 0, 1, 4, 0, 0, 7, 3, 5, 3, 8, 9, 1, 3, 3, 1, 2, 0, 7, 5, 8, 6,
       2, 0, 2, 3, 6, 9, 9, 7, 8, 9, 4, 9, 2, 1, 3, 1, 1, 4, 9, 1, 4, 4,
       2, 6, 3, 7, 7, 4, 7, 5, 1, 9, 0, 2, 2, 3, 9, 1, 1, 1, 5, 0, 6, 3,
       4, 8, 1, 0, 3, 9, 6, 2, 6, 4, 7, 1, 4, 1, 5, 4, 8, 9, 2, 9, 9, 8,
       9, 6, 3, 6, 4, 6, 2, 9, 1, 2, 0, 5, 9, 2, 7, 7, 2, 8, 8, 5, 0, 6,
       0, 0, 2, 9, 0, 4, 7, 7, 1, 5, 7, 9, 4, 6, 1, 5, 7, 6, 5, 0, 4, 8,
       7, 6, 1, 8, 7, 3, 7, 3, 1, 0, 3, 4, 5, 4, 0, 5, 4, 0, 3, 5, 1, 0,
       8, 3, 7, 0, 9, 6, 6, 9, 5, 4, 6, 9, 3, 5, 4, 2, 4, 8, 7, 7, 5, 8,
       8, 8, 2, 6, 9, 3, 1, 0, 4, 1, 5, 9, 0, 6, 2, 1, 3, 0, 6, 0, 0, 8,
       3, 2, 0, 0, 6, 0, 0, 4, 7, 2, 7, 1, 9, 9, 3, 9, 8, 4, 6, 6, 5, 3,
       8, 1, 8, 7, 1, 3, 7, 6, 3, 6, 3, 6, 3, 2, 3, 2, 2, 7, 9, 2, 3, 2,
       7, 5, 5, 8, 8, 2, 0, 1, 4, 0, 6, 3, 7, 1, 1, 1, 4, 7, 0, 2, 9, 2,
       0, 5, 6, 0, 8, 9, 6, 2, 0, 0, 7, 2, 0, 4, 2, 0, 9, 1, 6, 9, 3, 0,
       0, 2, 0, 6, 8, 4, 0, 7, 2, 1, 9, 5, 2, 4, 8,

In [204]:
# X_test: every row after the label row, i.e. the 784 pixel rows, one column per test image
X_test = data_test[1:]
X_test.shape

(784, 1000)

In [205]:
# Scale the pixel values by dividing by 255 (assuming raw pixels lie in 0-255, all values are then in the range 0 to 1)
X_test = X_test/255

In [206]:
# The shape is unchanged by the scaling
X_test.shape

(784, 1000)

In [207]:
#X_test[1:783, 1]

In [208]:
# Same steps for the training data: every row after the first 1000 (which were used as test data)
data_train = data[1000:]

In [209]:
# Transpose the training data so that each column is one image
data_train = data_train.T
data_train.shape

(785, 41000)

In [210]:
# Split the training data into labels and pixels
# Y_train: the first row of the transposed training data (the labels)
Y_train = data_train[0]
Y_train.shape

(41000,)

In [211]:
# X_train: every row after the label row, i.e. the 784 pixel rows, one column per training image
X_train = data_train[1:]
X_train.shape

(784, 41000)

In [212]:
# Scale the pixel values by dividing by 255 (assuming raw pixels lie in 0-255, all values are then in the range 0 to 1)
X_train = X_train/255

In [213]:
#X_train[1:783, 1]

## Defining functions for neural networks components: 
    
* Weights & biases initialization 
* Forward propagation 
* Backward propagation 

In [214]:
## The process for a neural network is: initialize parameters, forward propagation, backward propagation, update parameters

In [215]:
## initialize parameters weight and biases 
def initial_parameters():
    """Draw the starting weights and biases of the two-layer network.

    Every entry is sampled uniformly from [0, 1) with ``np.random.rand`` and
    then scaled down: the first-layer weights by sqrt(1/784), everything else
    by sqrt(1/10).

    Returns:
        Tuple ``(w1, w2, b1, b2)`` with shapes (10, 784), (10, 10), (10, 1)
        and (10, 1). Note the order: both weight matrices come first.
    """
    first_layer_scale = np.sqrt(1/784)
    small_scale = np.sqrt(1/10)

    # The draw order (w1, w2, b1, b2) determines the values under a fixed seed.
    w1 = np.random.rand(10, 784) * first_layer_scale
    w2 = np.random.rand(10, 10) * small_scale
    b1 = np.random.rand(10, 1) * small_scale
    b2 = np.random.rand(10, 1) * small_scale
    return w1, w2, b1, b2

In [261]:
## Forward propagation 

# Define activation functions for layers 1 and 2
# For ReLU I use the element-wise maximum of z and 0: the function takes an input z and returns 0 where the value is smaller than 0, or the value of z itself where it is greater than zero


#Layer 1 Relu
def Relu_activation(z):
    """Apply ReLU element-wise: keep positive entries of ``z``, replace the rest with 0."""
    rectified = np.maximum(z, 0)
    return rectified


# Derivative of ReLU: the slope is 1 for positive values, so the derivative there is 1; the slope of the flat section (z <= 0) is 0

def D_Relu(z):
    """Return the ReLU derivative as a boolean mask: True where ``z`` is strictly positive.

    When multiplied with a numeric array, True acts as 1 and False as 0.
    """
    is_positive = z > 0
    return is_positive

# Layer 2: softmax, since this is a classification problem and we want a probability for each class: exponential of a particular class / sum of the exponentials of all classes
def softmax_stable(x):
    """Return the numerically stable softmax of ``x``, computed per column.

    The reduction runs along axis 0, so for a 2-D input of shape
    (classes, examples) every column is normalized on its own and sums to 1.
    A 1-D input is treated as a single example.

    Args:
        x: array of scores; axis 0 indexes the classes.

    Returns:
        Array of the same shape as ``x`` holding the class probabilities.
    """
    # Subtracting the per-column maximum leaves the result unchanged but makes
    # the largest exponent exp(0) = 1, which prevents overflow.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    # Normalize each column by its own total, not by the sum over all examples.
    return exps / np.sum(exps, axis=0, keepdims=True)


# Forward Propagation 
def forward_propagation(w1, w2, b1, b2, x): 
    """Run one forward pass through both layers.

    Layer 1 is an affine map followed by ``Relu_activation``; layer 2 is an
    affine map followed by ``softmax_stable``.

    Args:
        w1, b1: weights and biases of the first layer.
        w2, b2: weights and biases of the second layer.
        x: input matrix (assumed to hold one example per column).

    Returns:
        Tuple ``(z1, a1, z2, a2)``: the pre-activation and activation of
        layer 1 followed by those of layer 2.
    """
    hidden_pre = np.dot(w1, x) + b1
    hidden_act = Relu_activation(hidden_pre)

    output_pre = np.dot(w2, hidden_act) + b2
    output_act = softmax_stable(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act


# Back propagation 

# First we have to one-hot encode the y values 0-9

# This function does two things:
# 1. It creates a matrix of zeros with one row per example (Y.size = m rows) and one column per possible output (Y.max()+1 = 10 classes), e.g. an array of shape (1000, 10) for 1000 examples
# 2. It indexes into one_hot_y using arrays:
#    for each row, it goes to the column specified by the label in Y and sets that entry to one
# The final step is to transpose the one-hot encoded y so that each column (not each row) is an example

def one_hot(Y, num_classes=None):
    """One-hot encode integer labels, one column per example.

    Args:
        Y: 1-D numpy array of non-negative integer class labels.
        num_classes: number of rows (classes) in the result. When omitted it
            is inferred as ``Y.max() + 1``, which is only correct if the
            highest class actually occurs in ``Y``; pass it explicitly for
            batches that may not contain every class.

    Returns:
        Array of shape (num_classes, Y.size) with a single 1 in every column,
        at the row given by that example's label.

    Raises:
        ValueError: if ``Y`` is empty and ``num_classes`` is not given
            (the maximum of an empty array is undefined).
    """
    if num_classes is None:
        num_classes = Y.max() + 1
    # Build one row per example, set the entry at each row's label to 1 ...
    one_hot_y = np.zeros((Y.size, num_classes))
    one_hot_y[np.arange(Y.size), Y] = 1
    # ... then transpose so that every column is an example.
    return one_hot_y.T

# Second, we have to define the backpropagation derivatives. 
# The aim of back propagation is to find the rate of change of the loss function with respect to the weights and biases of each layer (the partial derivatives of the loss),
# and to use gradient descent to find the weights and biases that produce the smallest loss. The learning rate is a tuning parameter that decides the step size at each 
# iteration while moving towards the minimum of the loss function 


def back_propagation(z1,a1,z2,a2,w2,x,Y):
    """Compute the gradients of the loss w.r.t. both layers' weights and biases.

    All matrices are expected to hold one example per column.

    Args:
        z1: pre-activation of layer 1, shape (hidden, m).
        a1: activation of layer 1, shape (hidden, m).
        z2: pre-activation of layer 2 (unused, kept for the call signature).
        a2: softmax output of layer 2, shape (classes, m).
        w2: weights of layer 2, shape (classes, hidden).
        x: network input, shape (features, m).
        Y: 1-D integer array with the true class label of each example.

    Returns:
        Tuple ``(dw1, dw2, db1, db2)``. The weight gradients have the shapes
        of ``w1`` and ``w2``; the bias gradients are column vectors of shape
        (hidden, 1) and (classes, 1), one entry per unit.
    """
    m = Y.size

    # One-hot targets sized like a2, so the number of rows always matches the
    # network output even if the highest class is missing from this batch.
    one_hot_y = np.zeros_like(a2, dtype=float)
    one_hot_y[Y, np.arange(m)] = 1.0

    # Output layer: for softmax the error term is (prediction - target).
    dz2 = a2 - one_hot_y
    dw2 = dz2.dot(a1.T) / m
    # Sum over the examples only (axis 1) so each output unit gets its own
    # bias gradient instead of one scalar shared by every unit.
    db2 = np.sum(dz2, axis=1, keepdims=True) / m

    # Hidden layer: push the error back through w2 and gate it with the ReLU
    # derivative (1 where z1 > 0, otherwise 0).
    dz1 = w2.T.dot(dz2) * (z1 > 0)
    dw1 = dz1.dot(x.T) / m
    db1 = np.sum(dz1, axis=1, keepdims=True) / m

    return dw1, dw2, db1, db2


# Function for updating the parameters 

def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
    """Take one gradient-descent step on every parameter.

    Each parameter is moved against its gradient: ``p - alpha * dp``.

    Args:
        w1, b1, w2, b2: current weights and biases.
        dw1, db1, dw2, db2: the matching gradients.
        alpha: learning rate (step size).

    Returns:
        Tuple ``(w1, b1, w2, b2)`` with the updated parameters.
    """
    current = (w1, b1, w2, b2)
    gradients = (dw1, db1, dw2, db2)
    return tuple(param - alpha * grad for param, grad in zip(current, gradients))


### Gradient Descent

In [273]:
# Gradient descent to optimize the loss 

""" This part is where I put all the functions I defined earlier together, the flow is as follows: 

* initialize paramters 
* loop for a specific number of iterations, in the loop 
    * forward propagate 
    * back propagate
    * update paramters
"""

# Get prediction function 
def get_predictions(a2):
    """Return, for every column of ``a2``, the row index holding its largest value."""
    predicted_rows = np.argmax(a2, axis=0)
    return predicted_rows

# Accuracy function 
def get_accuracy(predictions, Y):
    """Print both arrays, then return the fraction of predictions that equal the labels.

    Args:
        predictions: array of predicted labels.
        Y: numpy array of true labels.

    Returns:
        Number of matching entries divided by ``Y.size``.
    """
    print(predictions, Y)
    matches = predictions == Y
    n_correct = np.sum(matches)
    return n_correct / Y.size
    

def gradient_decent(x,Y,iterations,alpha):
    """Train the network with full-batch gradient descent.

    Starts from freshly drawn parameters and repeats forward pass, backward
    pass and parameter update ``iterations`` times. On every 50th iteration
    (starting with iteration 0) the iteration number and the accuracy of that
    iteration's forward pass on ``x``/``Y`` are printed.

    Args:
        x: input matrix passed to the forward and backward pass.
        Y: true labels for the columns of ``x``.
        iterations: number of update steps to perform.
        alpha: learning rate.

    Returns:
        Tuple ``(w1, b1, w2, b2)`` with the trained parameters.
    """
    # initial_parameters returns the two weight matrices first, then the biases.
    w1, w2, b1, b2 = initial_parameters()

    for step in range(iterations):
        z1, a1, z2, a2 = forward_propagation(w1, w2, b1, b2, x)
        gradients = back_propagation(z1, a1, z2, a2, w2, x, Y)
        dw1, dw2, db1, db2 = gradients
        w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha)

        # Only report progress on every 50th step.
        if step % 50 != 0:
            continue
        print("iteration:" , step)
        # a2 was computed before this step's update, so the accuracy reflects
        # the parameters used in the forward pass above.
        print(get_accuracy(get_predictions(a2), Y))

    return w1, b1, w2, b2

In [None]:
# Train on the training split: 500 iterations with a learning rate of 0.3
w1, b1, w2, b2 = gradient_decent(X_train, Y_train, 500, 0.3)