In [13]:
import numpy as np
from tensorflow import keras #JUST FOR THE MNIST DATASET (And the one-hot encoding)

# Load Digit MNIST Dataset
mnist = keras.datasets.mnist

# Recover training & testing splits
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
print("Read in MNIST dataset")
print("X_train.shape: ", X_train.shape)
print("Y_train.shape: ", Y_train.shape)
print("X_test.shape: ", X_test.shape)
print("Y_test.shape: ", Y_test.shape)

# Flatten the training & testing data
print("")
print("FLATTEN")
X_train = X_train.reshape((60000, 28*28))
X_test = X_test.reshape((10000, 28*28))

#Transpose the images
X_train = X_train.T
X_test = X_test.T


print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)


# Normalize the training & testing data
X_train = X_train / 255
X_test = X_test / 255

m = 60000 # Number of training examples

Read in MNIST dataset
X_train.shape:  (60000, 28, 28)
Y_train.shape:  (60000,)
X_test.shape:  (10000, 28, 28)
Y_test.shape:  (10000,)

FLATTEN
X_train.shape:  (784, 60000)
X_test.shape:  (784, 10000)


### Backpropagation Calculations

We first define our variables and functions. 
- x: input vector 
- y: predicted output
- w1: weight vector between input and hidden layer
- w2: weight vector between hidden and output layer
- b1: bias vector for hidden layer
- b2: bias vector for output layer
- z1: weighted sum of input and hidden layer -> $z_1 = w_1^Tx + b_1$
- z2: weighted sum of hidden and output layer -> $z_2 = w_2^Ta_1 + b_2$
- a1: activation of hidden layer (ReLU) -> $a_1 = \begin{cases} z_1 & z_1 > 0 \\ 0 & z_1 \leq 0 \end{cases}$
- a2: activation of output layer (Softmax) -> $a_2 = \frac{e^{z_2}}{\sum_{i=1}^n e^{z_2}}$
- C0: cross-entropy loss -> $C_0 = -\sum_{i=1}^n y_i \log a_{2i}$

- $\frac{\partial C_0}{\partial z_2}$ #From Piazza @121_f1

$\frac{\partial C_0}{\partial z_2} = (a_2 - y)$
; $\frac{\partial z_2}{\partial w_2} = a_1$
; $\frac{\partial z_2}{\partial b_2} = 1$
; $\frac{\partial z_2}{\partial a_1} = w_2$
; $\frac{\partial a_1}{\partial z_1} = \begin{cases} 1 & z_1 > 0 \\ 0 & z_1 \leq 0 \end{cases}$
; $\frac{\partial z_1}{\partial w_1} = x$
; $\frac{\partial z_1}{\partial b_1} = 1$
; $\frac{\partial z_1}{\partial x} = w_1$


Where we update our weights and biases using gradient descent:

- $w_1 = w_1 - t \frac{\partial C_0}{\partial w_1}$
- $w_2 = w_2 - t \frac{\partial C_0}{\partial w_2}$
- $b_1 = b_1 - t \frac{\partial C_0}{\partial b_1}$
- $b_2 = b_2 - t \frac{\partial C_0}{\partial b_2}$

Where $t$ is the learning rate.


In [14]:
def initialWeights(neurons):
    # We subtract 0.5 to center the weights around 0

    # Input -> First Hidden Layer
    w1 = np.random.rand(neurons, 784) - 0.5 # 784 features
    b1 = np.random.rand(neurons, 1) - 0.5

    # First Hidden Layer -> Output
    w2 = np.random.rand(10, neurons) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return w1, b1, w2, b2

def ReLU(x):
    # if x > 0:
    #     return x
    # else:
    #     return 0
    return np.maximum(0, x) #vectorized

def diffReLU(x):
    # if x > 0:
    #     return 1
    # else:
    #     return 0
    return x > 0 #vectorized

def softmax(x):
    return np.exp(x)/sum(np.exp(x))

def ForwardPass(x, w1, b1, w2, b2):
    z1 = np.dot(w1, x) + b1 # Weight matrix * input vector + bias vector
    a1 = ReLU(z1) # Apply activation function
    z2 = np.dot(w2, a1) + b2 # Weight matrix * input vector + bias vector
    a2 = softmax(z2) # Apply softmax
    return z1, a1, z2, a2 # Return all intermediate values for backprop & final output

def BackProp(x, y, z1, a1, z2, w2, a2):
    m = y.shape[0] # Number of training examples

    #One-hot encode the labels
    one_hot_Y =  keras.utils.to_categorical(y).T
   
    # dC0/dz2 = a2 - y
    dz2 = a2 - one_hot_Y
    
    # dC0/dw2 = dC0/dz2 * a1
    dw2 = np.dot(dz2, a1.T) / m

    # dC0/db2 = dC0/dz2 * 1
    db2 = np.sum(dz2) / m #average to match shape of bias

    # dC0/dz1 = dC0/dz2 * w2 * diffReLU(z1)
    dz1 = np.dot(w2.T, dz2) * diffReLU(z1)

    # dC0/dw1 = dC0/dz1 * x
    dw1 = np.dot(dz1,x.T) / m

    # dC0/db1 = dC0/dz1 * 1
    db1 = np.sum(dz1) / m #average to match shape of bias

    return dw1, db1, dw2, db2


def updateWeights(w1, b1, w2, b2, dw1, db1, dw2, db2, lr):
    # Apply gradient descent
    w1 = w1 - lr * dw1
    b1 = b1 - lr * db1
    w2 = w2 - lr * dw2
    b2 = b2 - lr * db2
    return w1, b1, w2, b2

def accuracy(a2, y):
    # number of correct predictions / total number of predictions
    return np.sum(np.argmax(a2, axis=0) == y) / y.size 

def fit(X, Y, iterations, lr):
    w1, b1, w2, b2 = initialWeights(neurons = 256)
    for i in range(1,iterations+1):
        z1, a1, z2, a2 = ForwardPass(X_train, w1, b1, w2, b2)
        dw1, db1, dw2, db2 = BackProp(X, Y, z1, a1, z2, w2, a2)
        w1, b1, w2, b2 = updateWeights(w1, b1, w2, b2, dw1, db1, dw2, db2, lr)
        if i % 25 == 0 or i == 1:
            print("iter", i)
            print("Accuracy:",accuracy(a2, Y))  
    return w1, b1, w2, b2   

In [15]:
parameters = fit(X_train, Y_train, 250, 0.5);

iter 1
Accuracy: 0.05705
iter 25
Accuracy: 0.5424333333333333
iter 50
Accuracy: 0.7780666666666667
iter 75
Accuracy: 0.8331166666666666
iter 100
Accuracy: 0.8582666666666666
iter 125
Accuracy: 0.8450666666666666
iter 150
Accuracy: 0.8846833333333334
iter 175
Accuracy: 0.8935333333333333
iter 200
Accuracy: 0.9004333333333333
iter 225
Accuracy: 0.9061
iter 250
Accuracy: 0.9107


In [16]:
# Evaluate the model
z1, a1, z2, a2 = ForwardPass(X_test, parameters[0], parameters[1], parameters[2], parameters[3])
print("Accuracy:",accuracy(a2, Y_test))

Accuracy: 0.9097
