In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import ipdb

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])


In [3]:
X = X[permutation]
y = y[permutation].astype(np.int)
X = X.reshape((X.shape[0], -1))
print(y)

[0 4 1 ... 7 1 1]


In [4]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=60000, test_size=10000)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).T
X_test = scaler.transform(X_test).T

# plt.gray()
# plt.imshow(X_test[:, 50].reshape(28, 28))
# plt.show()
# X_train = X_train.T

In [5]:
X_train[:, 99]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -4.41807799e-03, -5.75481961e-03, -4.08251693e-03, -4.08251693e-03,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -4.08251693e-03, -5.74150335e-03, -8.30140398e-03, -1.15646815e-02,
       -1.51589738e-02, -1.94049683e-02, -2.48813400e-02, -3.09159998e-02,
       -3.26207722e-02, -3.32842765e-02, -3.34251756e-02, -3.00933581e-02,
       -3.05349307e-02, -2.80464045e-02, -2.38059959e-02, -1.92318114e-02,
       -1.64272691e-02, -1.09963601e-02, -8.32486080e-03, -4.38069356e-03,
        0.00000000e+00,  

In [6]:
def init_params():
    W1 = np.random.rand(64, 784) - 0.5 #randn is b/w -0.5 and 0.5
    b1 = np.random.rand(64, 1) - 0.5
    W2 = np.random.rand(10, 64) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

In [7]:
def ReLu(z):
    return np.maximum(0, z)
def deriv_ReLu(Z):
    return Z > 0
def softmax(Z):
    return np.exp(Z) / sum(np.exp(Z)) #collapses into one row for the sum

In [11]:
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLu(Z1)
    Z2 = W2.dot(A1) + b2
    #ipdb.set_trace()
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2
def one_hot(Y):
    one_hot_Y = np.zeros((Y.max() + 1, Y.size)) #making a matrix of m x 10 here
    one_hot_Y[Y, np.arange(Y.size)] = 1 #going through all rows and setting the column w/ index corresponding to the y to 1, its very easy to iterate over numpy arays like this apparently
    return one_hot_Y
def back_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    ipdb.set_trace()
    m = Y.size
    one_hot_Y = one_hot(Y)  
    
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 /m * np.sum(dZ2, axis = 1, keepdims = True)
    
    dZ1 = W2.T.dot(dZ2) * deriv_ReLu(Z1)
    dW1 = 1 / m *dZ1.dot(X.T)
    db1 = 1/ m * np.sum(dZ1, axis = 1, keepdims = True)
    return dW1, db1, dW2, db2
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

In [12]:
def get_predictions(A2):
    return np.argmax(A2, 0)
def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size
def grad_descent(X, Y, iterations, alpha):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if (i%10 == 0):
            print("Iteration: ", i)
            print("Accuracy:", get_accuracy(get_predictions(A2), Y))
    return W1, b1, W2, b2

In [None]:
W1, b1, W2, b2 = grad_descent(X_train, y_train, 100, 0.1)

> [0;32m<ipython-input-11-5573489d12ae>[0m(14)[0;36mback_prop[0;34m()[0m
[0;32m     13 [0;31m    [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 14 [0;31m    [0mm[0m [0;34m=[0m [0mY[0m[0;34m.[0m[0msize[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m    [0mone_hot_Y[0m [0;34m=[0m [0mone_hot[0m[0;34m([0m[0mY[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m<ipython-input-11-5573489d12ae>[0m(15)[0;36mback_prop[0;34m()[0m
[0;32m     14 [0;31m    [0mm[0m [0;34m=[0m [0mY[0m[0;34m.[0m[0msize[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 15 [0;31m    [0mone_hot_Y[0m [0;34m=[0m [0mone_hot[0m[0;34m([0m[0mY[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m[0;34m[0m[0m
[0m
ipdb> n
> [0;32m<ipython-input-11-5573489d12ae>[0m(17)[0;36mback_prop[0;34m()[0m
[0;32m     16 [0;31m[0;34m[0m[0m
[0m[0;32m---> 17 [0;31m    [0mdZ2[0m [0;34

ipdb> n
> [0;32m<ipython-input-12-67372bd1ca21>[0m(12)[0;36mgrad_descent[0;34m()[0m
[0;32m     11 [0;31m        [0mW1[0m[0;34m,[0m [0mb1[0m[0;34m,[0m [0mW2[0m[0;34m,[0m [0mb2[0m [0;34m=[0m [0mupdate_params[0m[0;34m([0m[0mW1[0m[0;34m,[0m [0mb1[0m[0;34m,[0m [0mW2[0m[0;34m,[0m [0mb2[0m[0;34m,[0m [0mdW1[0m[0;34m,[0m [0mdb1[0m[0;34m,[0m [0mdW2[0m[0;34m,[0m [0mdb2[0m[0;34m,[0m [0malpha[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 12 [0;31m        [0;32mif[0m [0;34m([0m[0mi[0m[0;34m%[0m[0;36m10[0m [0;34m==[0m [0;36m0[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m            [0mprint[0m[0;34m([0m[0;34m"Iteration: "[0m[0;34m,[0m [0mi[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> W12
*** NameError: name 'W12' is not defined
ipdb> W2
array([[-2.50505115e-01, -1.92998816e-01, -2.06953772e-01,
         4.26339416e-01,  3.17130342e-01,  3.33379006e-01,
        -1.2

In [None]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

def test_prediction(index, W1, b1, W2, b2):
    current_image = X_test[:, index, None]
    prediction = make_predictions(X_test[:, index, None], W1, b1, W2, b2)
    label = y_test[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
    
    current_image = current_image.reshape((28, 28))
    plt.gray()
    plt.imshow(current_image, interpolation='nearest')
    plt.show()

test_prediction(206, W1, b1, W2, b2)
test_prediction(106, W1, b1, W2, b2)
test_prediction(996, W1, b1, W2, b2)
test_prediction(2016, W1, b1, W2, b2)

In [None]:
Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X_test)
print("Accuracy:", get_accuracy(get_predictions(A2), y_test))