In [80]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [81]:
data = pd.read_csv('train.csv')

In [82]:
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
#since we want to manipulate arrays lets convert everything into numpy arrays
data = np.array(data)

#set up some dev data to eliminate overfitting to actual data
m, n = data.shape
np.random.shuffle(data) #shuffle before splitting into dev and training set


data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.


data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_, m_train = X_train.shape

In [84]:
Y_train

array([0, 5, 7, ..., 2, 9, 3], dtype=int64)

In [85]:
def init_params():
    w1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    w2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return w1, b1, w2, b2

In [92]:
def ReLU(z):
    return np.maximum(z, 0) #if each element in z is greater than z return that, if 0 then return 0

def softmax(z):
    var = np.exp(z) / sum(np.exp(z))
    return var
    

def forward_prop(w1, b1, w2, b2, X):
    z1 = w1.dot(X) + b1
    a1 = ReLU(z1)
    z2 = w2.dot(a1) + b2
    a2 = softmax(z2)
    return z1, a1, z2, a2
    
def deriv_relu(z):
    return z > 0 #when booleans are converted to num, true = 1 and false = 0]
# so if 1 elemetn in z is more than 0 return one otherwise 0

def one_hot(Y): #one hot encode Y
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1 #array from range 0 to m, labels set to 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(z1, a1, z2, a2, w1, w2, X, Y):
    one_hot_Y = one_hot(Y)
    dz2 = a2 - one_hot_Y
    dw2 = 1/m * dz2.dot(a1.T)
    db2 = 1/m * np.sum(dz2)
    dz1 = w2.T.dot(dz2) * deriv_relu(z1)
    dw1 = 1/m * dz1.dot(X.T)
    db1 = 1/m * np.sum(dz2)
    return dw1, db1, dw2, db2

In [93]:
def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, a):
    w1 = w1 - a*dw1
    b1 = b1 - a*db1
    w2 = w2 - a*dw2
    b2 = b2 - a*db2
    return w1, b1, w2, b2

In [94]:
def get_pred(a2):
    return np.argmax(a2, 0)

def get_acc(pred, Y):
    print(pred, Y)
    return np.sum(pred == Y) / Y.size

def grad_descent(X, Y, a, iterations):
    w1, b1, w2, b2 = init_params()
    for i in range(iterations):
        z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, X)
        dw1, db1, dw2, db2 = backward_prop(z1, a1, z2, a2, w1, w2, X, Y)
        w1, b1, w2, b2 = update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, a)
        if i % 10 == 0:
            print("Iteration: ", i)
            pred = get_pred(a2)
            print(get_acc(pred, Y))
    return w1, b1, w2, b2

In [96]:
w1, b1, w2, b2 = grad_descent(X_train, Y_train, 0.10, 1000)

Iteration:  0
[2 2 2 ... 2 6 2] [0 5 7 ... 2 9 3]
0.09302439024390244
Iteration:  10
[3 8 8 ... 7 5 8] [0 5 7 ... 2 9 3]
0.21765853658536585
Iteration:  20
[3 5 7 ... 7 7 8] [0 5 7 ... 2 9 3]
0.3605121951219512
Iteration:  30
[3 3 7 ... 2 7 8] [0 5 7 ... 2 9 3]
0.457
Iteration:  40
[3 3 7 ... 2 7 3] [0 5 7 ... 2 9 3]
0.513609756097561
Iteration:  50
[3 3 7 ... 2 7 3] [0 5 7 ... 2 9 3]
0.5547560975609757
Iteration:  60
[3 3 7 ... 2 7 3] [0 5 7 ... 2 9 3]
0.5862926829268292
Iteration:  70
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.6109024390243902
Iteration:  80
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.6322926829268293
Iteration:  90
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.6510487804878049
Iteration:  100
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.667829268292683
Iteration:  110
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.6820731707317074
Iteration:  120
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.6953414634146341
Iteration:  130
[3 3 7 ... 2 9 3] [0 5 7 ... 2 9 3]
0.7075365853658536
Iteration:  140
[0 3 7 ... 2 9

In [97]:
def make_pred(X, w1, b1, w2, b2):
    _, _, _, a2 = forward_prop(w1, b1, w2, b2, X)
    pred = get_pred(a2)
    return pred
def test_pred(index, w1, b1, w2, b2):
    curr_img = X_train[:, index, None]
    pred = make_pred(X_train[:, index, None], w1, b1, w2, b2)
    label = Y_train[index]
    print("Prediction: ", pred)
    print("Label: ", label)
    
    curr_img = curr_img.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(curr_img, interpolation='nearest')
    plt.show()