In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
import ipdb

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])


In [3]:
X = X[permutation]
y = y[permutation].astype(np.int)
X = X.reshape((X.shape[0], -1))
print(y)

[0 4 1 ... 7 1 1]


In [4]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=60000, test_size=10000)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).T
X_test = scaler.transform(X_test).T

# plt.gray()
# plt.imshow(X_test[:, 50].reshape(28, 28))
# plt.show()
# X_train = X_train.T

In [5]:
X_train[:, 99]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -4.41807799e-03, -5.75481961e-03, -4.08251693e-03, -4.08251693e-03,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -4.08251693e-03, -6.22758214e-03, -9.22759513e-03, -1.14055111e-02,
       -1.37738416e-02, -1.92915198e-02, -2.48398501e-02, -3.05603933e-02,
       -3.21923417e-02, -3.28766891e-02, -3.38292655e-02, -3.16914806e-02,
       -3.02095113e-02, -2.74643460e-02, -2.31361286e-02, -1.78810455e-02,
       -1.46259066e-02, -9.15862288e-03, -7.38446185e-03, -4.08251693e-03,
        0.00000000e+00,  

In [7]:
def init_params():
    W1 = np.random.rand(64, 784) - 0.5 #randn is b/w -0.5 and 0.5
    b1 = np.random.rand(64, 1) - 0.5
    W2 = np.random.rand(10, 64) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

In [8]:
def ReLu(z):
    return np.maximum(0, z)
def deriv_ReLu(Z):
    return Z > 0
def softmax(Z):
    return np.exp(Z) / sum(np.exp(Z)) #collapses into one row for the sum

In [24]:
def forward_prop(W1, b1, W2, b2, X):
    #ipdb.set_trace()
    Z1 = W1.dot(X) + b1
    A1 = ReLu(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2
def one_hot(Y):
    one_hot_Y = np.zeros((Y.max() + 1, Y.size)) #making a matrix of m x 10 here
    one_hot_Y[Y, np.arange(Y.size)] = 1 #going through all rows and setting the column w/ index corresponding to the y to 1, its very easy to iterate over numpy arays like this apparently
    return one_hot_Y
def back_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    #ipdb.set_trace()
    m = Y.size
    one_hot_Y = one_hot(Y)  
    
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 /m * np.sum(dZ2, axis = 1, keepdims = True)
    
    dZ1 = W2.T.dot(dZ2) * deriv_ReLu(Z1)
    dW1 = 1 / m *dZ1.dot(X.T)
    db1 = 1/ m * np.sum(dZ1, axis = 1, keepdims = True)
    return dW1, db1, dW2, db2
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

In [25]:
a = np.array([1, 2, 3, 4, 5, 6, 7])
b = one_hot(a)
a.size

7

In [26]:
def get_predictions(A2):
    return np.argmax(A2, 0)
def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size
def grad_descent(X, Y, iterations, alpha):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if (i%10 == 0):
            print("Iteration: ", i)
            print("Accuracy:", get_accuracy(get_predictions(A2), Y))
    return W1, b1, W2, b2

In [27]:
W1, b1, W2, b2 = grad_descent(X_train, y_train, 100, 0.1)

Iteration:  0
[0 4 8 ... 8 1 6] [5 1 2 ... 9 6 9]
Accuracy: 0.13758333333333334
Iteration:  10
[0 1 0 ... 8 1 7] [5 1 2 ... 9 6 9]
Accuracy: 0.60665
Iteration:  20
[5 1 3 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.7137166666666667
Iteration:  30
[5 1 3 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.7594166666666666
Iteration:  40
[5 1 3 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.7860333333333334
Iteration:  50
[5 1 2 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.8043666666666667
Iteration:  60
[5 1 2 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.81755
Iteration:  70
[5 1 2 ... 9 6 4] [5 1 2 ... 9 6 9]
Accuracy: 0.8275166666666667
Iteration:  80
[5 1 2 ... 9 6 9] [5 1 2 ... 9 6 9]
Accuracy: 0.8357666666666667
Iteration:  90
[5 1 2 ... 9 6 9] [5 1 2 ... 9 6 9]
Accuracy: 0.8409166666666666


In [None]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

def test_prediction(index, W1, b1, W2, b2):
    current_image = X_test[:, index, None]
    prediction = make_predictions(X_test[:, index, None], W1, b1, W2, b2)
    label = y_test[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
    
    current_image = current_image.reshape((28, 28))
    plt.gray()
    plt.imshow(current_image, interpolation='nearest')
    plt.show()

test_prediction(206, W1, b1, W2, b2)
test_prediction(106, W1, b1, W2, b2)
test_prediction(996, W1, b1, W2, b2)
test_prediction(2016, W1, b1, W2, b2)

In [None]:
Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X_test)
print("Accuracy:", get_accuracy(get_predictions(A2), y_test))