In [126]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display, Markdown
# Load the table and shuffle its rows in place so the split below is random.
data = np.array(pd.read_csv("images.csv"))
np.random.shuffle(data)
m, n = data.shape

# Column 0 is the label, the remaining columns are the features.
# The last 1000 shuffled rows are held out for testing.
# Features are transposed to (features, samples) and divided by 255.
X_train = data[:m - 1000, 1:].T / 255
Y_train = data[:m - 1000, 0]

X_test = data[m - 1000:, 1:].T / 255
Y_test = data[m - 1000:, 0]

X_train.shape




(784, 41000)

#### The following is an implementation of a neural network with one hidden layer. It is meant to give a better feel for how the differentiation works. All the formulas can be derived from the math presented in the markdown cell further below.

In [119]:
def init_params():
    """Draw random starting parameters for the 784-20-10 network.

    Every entry is sampled uniformly from [-0.5, 0.5).

    Returns:
        Tuple ``(W1, b1, W2, b2)`` with shapes (20, 784), (20, 1),
        (10, 20) and (10, 1).
    """
    def centred_uniform(rows, cols):
        # np.random.rand samples [0, 1); shifting by 0.5 centres it on zero.
        return np.random.rand(rows, cols) - 0.5

    # Order of the draws matters for reproducibility under a fixed seed.
    W1 = centred_uniform(20, 784)
    b1 = centred_uniform(20, 1)
    W2 = centred_uniform(10, 20)
    b2 = centred_uniform(10, 1)
    return W1, b1, W2, b2

def reLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and zero."""
    floor = 0
    return np.maximum(Z, floor)

def softmax(Z):
    """Column-wise softmax of ``Z``.

    Normalisation runs along axis 0, so for a (classes, samples) matrix every
    column sums to 1.

    Args:
        Z: array-like of logits.

    Returns:
        Array with the same shape as ``Z`` holding the normalised exponentials.
    """
    Z = np.asarray(Z)
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would turn the
    # division into nan) for large logits.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exps = np.exp(shifted)
    # np.sum along axis 0 replaces the builtin sum(), which looped over rows
    # in Python.
    return exps / np.sum(exps, axis=0, keepdims=True)
    
def forward_prop(W1, b1, W2, b2, X):
    """Run the two-layer network forward on ``X``.

    The hidden layer applies reLU, the output layer applies softmax.

    Returns:
        Tuple ``(Z1, A1, Z2, A2)``: pre-activation and activation of the
        hidden layer followed by those of the output layer.
    """
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = reLU(hidden_pre)
    output_pre = np.dot(W2, hidden_act) + b2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act

def reLU_deriv(Z):
    """Derivative of reLU as a boolean mask: True where ``Z`` is positive."""
    return np.greater(Z, 0)

def one_hot(Y, num_classes=None):
    """One-hot encode integer labels as a (classes, samples) matrix.

    Args:
        Y: 1-D array of non-negative integer class labels.
        num_classes: number of rows in the result. When omitted it is inferred
            as ``Y.max() + 1``; this under-counts whenever the batch happens
            not to contain the highest class, so pass it explicitly when the
            encoding must line up with a fixed-size network output.

    Returns:
        Float array of shape ``(num_classes, Y.size)`` with a single 1 per
        column, in the row given by that sample's label.
    """
    labels = np.asarray(Y)
    if num_classes is None:
        num_classes = labels.max() + 1
    # Build the matrix directly in (classes, samples) layout, so no transpose
    # is needed afterwards.
    one_hot_Y = np.zeros((num_classes, labels.size))
    one_hot_Y[labels, np.arange(labels.size)] = 1
    return one_hot_Y

def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    """Compute the batch-averaged gradients of the two-layer network.

    Args:
        Z1, A1: hidden-layer pre-activations and activations.
        Z2, A2: output-layer pre-activations and softmax activations.
        W1: hidden-layer weights (accepted for signature symmetry, not used).
        W2: output-layer weights.
        X: input batch, one sample per column.
        Y: integer labels, one per sample.

    Returns:
        Tuple ``(dW1, db1, dW2, db2)``.
    """
    # Average over the samples in THIS batch. The previous version divided by
    # the notebook-level global `m` (row count of the whole dataset), which
    # silently mis-scaled the gradients and tied the function to hidden state.
    n_samples = Y.size
    one_hot_Y = one_hot(Y)
    # This is the derivative of cross-entropy combined with softmax.
    # Implementing the actual loss function is only necessary for metrics.
    dZ2 = A2 - one_hot_Y
    dW2 = dZ2.dot(A1.T) / n_samples
    db2 = np.sum(dZ2, axis=1, keepdims=True) / n_samples
    dZ1 = W2.T.dot(dZ2) * reLU_deriv(Z1)
    dW1 = dZ1.dot(X.T) / n_samples
    db1 = np.sum(dZ1, axis=1, keepdims=True) / n_samples

    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """Take one gradient-descent step of size ``alpha`` on every parameter.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` of the updated parameters; the inputs are
        not modified.
    """
    params = (W1, b1, W2, b2)
    grads = (dW1, db1, dW2, db2)
    return tuple(param - alpha * grad for param, grad in zip(params, grads))


In [120]:
def get_predictions(A):
    """Return, for every column of ``A``, the row index of its largest entry."""
    return np.argmax(A, axis=0)

def get_accuracy(predictions, Y):
    """Fraction of entries in ``predictions`` that equal the labels ``Y``."""
    hits = predictions == Y
    return np.sum(hits) / Y.size

def grad_descent(X, Y, alpha, epochs):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X: input matrix, one sample per column.
        Y: integer labels, one per sample.
        alpha: learning rate.
        epochs: number of gradient steps to take.

    Returns:
        Tuple ``(W1, b1, W2, b2)`` of the trained parameters.
    """
    params = init_params()
    for epoch in range(epochs):
        Z1, A1, Z2, A2 = forward_prop(*params, X)
        grads = backward_prop(Z1, A1, Z2, A2, params[0], params[2], X, Y)
        params = update_params(*params, *grads, alpha)
        # Every tenth step, report accuracy of the forward pass that was
        # computed before this step's update.
        if epoch % 10 == 0:
            print("Epoch: ", epoch)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return tuple(params)

# Train on the training split: learning rate 0.10, 500 gradient steps.
W1, b1, W2, b2 = grad_descent(X_train, Y_train, alpha=0.10, epochs=500)

Epoch:  0
0.07563414634146341
Epoch:  10
0.2983170731707317
Epoch:  20
0.4287073170731707
Epoch:  30
0.5088292682926829
Epoch:  40
0.5664878048780487
Epoch:  50
0.6084878048780488
Epoch:  60
0.6422926829268293
Epoch:  70
0.6700487804878049
Epoch:  80
0.692560975609756
Epoch:  90
0.7105853658536585
Epoch:  100
0.725780487804878
Epoch:  110
0.7395121951219512
Epoch:  120
0.7498536585365854
Epoch:  130
0.7594146341463415
Epoch:  140
0.7686585365853659
Epoch:  150
0.7768780487804878
Epoch:  160
0.7827804878048781
Epoch:  170
0.7896585365853659
Epoch:  180
0.7958048780487805
Epoch:  190
0.801829268292683
Epoch:  200
0.806609756097561
Epoch:  210
0.8108536585365854
Epoch:  220
0.8150975609756097
Epoch:  230
0.8190731707317073
Epoch:  240
0.8227560975609756
Epoch:  250
0.8253414634146341
Epoch:  260
0.8284878048780487
Epoch:  270
0.8313170731707317
Epoch:  280
0.8335609756097561
Epoch:  290
0.8364146341463414
Epoch:  300
0.8390731707317073
Epoch:  310
0.8415853658536585
Epoch:  320
0.84414634

#### **This is a nice way of getting a better feel for how the derivatives work, but a generalizable implementation is needed.**



#### Here's the math for the backpropagation process. The idea is to view each layer as its own optimizable unit, which only needs the derivative handed back by the layer processed just before it in the backward pass (i.e. the next layer towards the output) to optimize its parameters.

#### $\frac{\partial L}{\partial z_n} = A_n-Y$, 
##### where $z_n$ represents the values passed to the activation in the last layer and $A_n$ represents the "activated" values

#### $ \frac{\partial L}{\partial z_{i-1}} = \frac{\partial L}{\partial z_i}*\frac{\partial z_i}{\partial z_{i-1}}$

#### $ \frac{\partial z_i}{\partial z_{i-1}} = w_i*\frac{\partial A_{i-1}}{\partial z_{i-1}} $

#### $ \frac{\partial L}{\partial w_i} = \frac{\partial L}{\partial z_i}*\frac{\partial z_i}{\partial w_i} = \frac{\partial L}{\partial z_i} * A_{i-1}^T $

#### $ \frac{\partial L}{\partial b_i} = \frac{\partial L}{\partial z_i}*\frac{\partial z_i}{\partial b_i} = {\frac{\partial L}{\partial z_{i}}}$


#### This is an inductive approach, which makes it feasible to assemble multiple layers into a single network. The implementation is given below. I have also implemented the sigmoid function, but reLU works better here.

In [139]:

def cross_entropy(Z, Y):
    """Element-wise cross-entropy terms ``-Y * log(Z)``.

    Uses the same argument convention as ``cross_entropy_deriv``: predictions
    first, targets second. The previous version computed ``-Z * log(Y)``,
    i.e. it took the logarithm of the one-hot targets, which yields ``inf``
    wherever a target is 0.

    Args:
        Z: predicted probabilities (e.g. softmax output).
        Y: target distribution with the same shape as ``Z`` (e.g. one-hot).

    Returns:
        Array of per-element loss contributions, same shape as the inputs.
        Sum over the class axis to obtain the per-sample loss.
    """
    # Clip away exact zeros so log() stays finite for saturated predictions.
    safe_probs = np.clip(Z, 1e-12, 1.0)
    return -Y * np.log(safe_probs)

def cross_entropy_deriv(A, Y):
    """Gradient used to start backpropagation: activations minus targets."""
    return np.subtract(A, Y)

def sigmoid(Z):
    """Logistic function ``1 / (1 + exp(-Z))``, applied element-wise."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def sigmoid_deriv(Z):
    """Derivative of the logistic function: ``s * (1 - s)`` with ``s = sigmoid(Z)``."""
    s = sigmoid(Z)
    return s * (1 - s)

# Lookup table from a function object to the function computing its derivative.
derivatives = {
    reLU: reLU_deriv,
    cross_entropy: cross_entropy_deriv,
    sigmoid: sigmoid_deriv,
}

class DenseLayer:
    """Fully connected layer computing ``activation(weights . input + biases)``.

    Weights have shape (nr_neurons, input_size) and biases (nr_neurons, 1);
    both are initialised uniformly in [-0.5, 0.5).
    """

    def __init__(self, input_size, nr_neurons, activation, is_first):
        """Create the layer.

        Args:
            input_size: number of rows in the input this layer receives.
            nr_neurons: number of output rows this layer produces.
            activation: callable applied element-wise to the pre-activations.
            is_first: whether this is the first layer of its network; stored
                for callers, not used by the layer itself.
        """
        self.activation = activation
        self.weights = np.random.rand(nr_neurons, input_size) - 0.5
        self.biases = np.random.rand(nr_neurons, 1) - 0.5
        self.is_first = is_first

    def forward_pass(self, input):
        """Return ``(z, a)``: the pre-activations and the activated output."""
        # Compute the affine map once and reuse it for both return values.
        z = self.weights.dot(input) + self.biases
        return z, self.activation(z)

    def backprop(self, dz, z_value, activation, batch_size, learning_rate,
                 activation_deriv=None):
        """Apply one gradient step and return the gradient for the layer below.

        Args:
            dz: gradient of the loss w.r.t. this layer's pre-activations.
            z_value: pre-activations of the layer that feeds this one.
            activation: the input this layer received in the forward pass.
            batch_size: number of samples, used to average the gradients.
            learning_rate: step size of the update.
            activation_deriv: derivative of the activation that turned
                ``z_value`` into ``activation``. Defaults to the reLU
                derivative, which is what this method always used before.

        Returns:
            Gradient of the loss w.r.t. ``z_value``.
        """
        if activation_deriv is None:
            activation_deriv = derivatives[reLU]
        dweights = dz.dot(activation.T) / batch_size
        dbiases = dz.sum(axis=1, keepdims=True) / batch_size
        # Propagate through the weights that were used in the forward pass.
        # Doing this after the update (as before) mixes parameters from two
        # different steps and yields an inconsistent gradient.
        next_dz = self.weights.T.dot(dz) * activation_deriv(z_value)
        self.weights = self.weights - learning_rate * dweights
        self.biases = self.biases - learning_rate * dbiases
        return next_dz
        
        
class NeuralNetwork:
    """Sequential stack of ``DenseLayer`` objects trained by gradient descent.

    The backward pass starts from ``derivatives[loss](output, one_hot_labels)``
    and hands that gradient down layer by layer. Hidden-layer gradients are
    computed with the reLU derivative inside ``DenseLayer.backprop``,
    regardless of each layer's own activation.
    """

    def __init__(self, loss=cross_entropy):
        """Create an empty network.

        Args:
            loss: loss function; must be a key of ``derivatives``.
        """
        self.layers = []
        self.loss = loss

    def train(self, batch, labels, learning_rate):
        """Run one forward/backward pass over ``batch`` and update every layer.

        Args:
            batch: input matrix, one sample per column.
            labels: integer class labels, one per sample.
            learning_rate: step size passed to every layer.

        Raises:
            ValueError: if no layer has been added yet.
        """
        if not self.layers:
            raise ValueError("cannot train a network without layers; call add_layer first")

        # For every layer remember the (pre-activation, activation) pair that
        # fed it. The first layer is fed by the raw batch, which stands in for
        # both values; the gradient computed for it is never used.
        layer_inputs = []
        z_prev, a_prev = batch, batch
        for layer in self.layers:
            layer_inputs.append((z_prev, a_prev))
            z_prev, a_prev = layer.forward_pass(a_prev)

        one_hot_y = one_hot(labels)
        dz = derivatives[self.loss](a_prev, one_hot_y)

        batch_size = batch.shape[1]
        # Walk the layers from output to input, passing the gradient down.
        for layer, (z_in, a_in) in zip(reversed(self.layers), reversed(layer_inputs)):
            dz = layer.backprop(dz, z_in, a_in, batch_size, learning_rate)

    def add_layer(self, input_size, nr_neurons, activation):
        """Append a dense layer to the end of the network.

        Args:
            input_size: number of inputs of the new layer.
            nr_neurons: number of neurons (outputs) of the new layer.
            activation: activation function of the new layer.

        Raises:
            ValueError: if ``input_size`` differs from the output size of the
                layer added before it.
        """
        is_first = len(self.layers) == 0
        if not is_first:
            # Catch wiring mistakes here instead of as an opaque shape error
            # deep inside the first forward pass.
            expected = self.layers[-1].weights.shape[0]
            if input_size != expected:
                raise ValueError(
                    "layer input_size {} does not match previous layer output size {}".format(
                        input_size, expected
                    )
                )
        self.layers.append(DenseLayer(input_size, nr_neurons, activation, is_first))

    def predict(self, input):
        """Return the predicted class index for every column of ``input``."""
        temp = input
        for layer in self.layers:
            # forward_pass returns (z, a); only the activation is fed forward.
            temp = layer.forward_pass(temp)[1]
        return np.argmax(temp, 0)
    
# Build a 784-30-20-10 network and train it on the full training split,
# reporting accuracy on the held-out test split every tenth epoch.
EPOCHS = 200
nn = NeuralNetwork()
nn.add_layer(784, 30, reLU)
nn.add_layer(30, 20, reLU)
nn.add_layer(20, 10, softmax)
for epoch in range(EPOCHS):
    nn.train(X_train, Y_train, 0.1)
    if epoch % 10 != 0:
        continue
    predictions = nn.predict(X_test)
    print("Epoch: ", epoch)
    print("Accuracy", get_accuracy(predictions, Y_test))


Epoch:  0
Accuracy 0.085
Epoch:  10
Accuracy 0.221
Epoch:  20
Accuracy 0.361
Epoch:  30
Accuracy 0.447
Epoch:  40
Accuracy 0.506
Epoch:  50
Accuracy 0.577
Epoch:  60
Accuracy 0.614
Epoch:  70
Accuracy 0.647
Epoch:  80
Accuracy 0.672
Epoch:  90
Accuracy 0.708
Epoch:  100
Accuracy 0.723
Epoch:  110
Accuracy 0.738
Epoch:  120
Accuracy 0.751
Epoch:  130
Accuracy 0.763
Epoch:  140
Accuracy 0.776
Epoch:  150
Accuracy 0.791
Epoch:  160
Accuracy 0.797
Epoch:  170
Accuracy 0.804
Epoch:  180
Accuracy 0.81
Epoch:  190
Accuracy 0.813
