In [1]:
import numpy as np
import pandas as pd

$$
\begin{align*}
X &= \text{input}\\
Z_1 &= W_1 X + b_1\\
A_1 &= \text{ReLU}(Z_1)\\
Z_2 &= W_2 A_1 + b_2\\
A_2 &= \text{softmax}(Z_2)
\end{align*}
$$

In [268]:
def ReLU(X):
    """Rectified linear unit: element-wise maximum of ``X`` and zero."""
    floor = 0
    return np.maximum(floor, X)

def dReLU(X):
    """Derivative of ReLU as a boolean mask: True where ``X`` is strictly positive."""
    is_active = X > 0
    return is_active

def softmax(X):
    """Softmax taken along axis 0, so every column of the result sums to 1."""
    # Shifting by the per-column maximum does not change the result but keeps
    # np.exp from overflowing on large inputs.
    column_max = np.max(X, axis=0, keepdims=True)
    exponentials = np.exp(X - column_max)
    normalizer = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / normalizer

def one_hot(Y, num_classes=None):
    """Encode integer class labels as a one-hot matrix with one column per sample.

    Parameters
    ----------
    Y : array-like of int
        Non-negative class indices. The input is flattened; column ``j`` of the
        result corresponds to the ``j``-th label.
    num_classes : int, optional
        Number of rows (classes) in the result. When omitted it is inferred as
        ``Y.max() + 1``, which is only correct if the largest class actually
        occurs in ``Y``; pass it explicitly when encoding a subset of the data.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_classes, Y.size)`` holding a single 1 in
        each column and 0 elsewhere.

    Raises
    ------
    ValueError
        If ``Y`` is empty and ``num_classes`` is not given, or if any label is
        negative or not smaller than ``num_classes``.
    """
    labels = np.asarray(Y).ravel()
    if num_classes is None:
        if labels.size == 0:
            raise ValueError("num_classes must be given when Y is empty")
        num_classes = int(labels.max()) + 1
    # Negative indices would otherwise wrap around silently and mark the wrong class.
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            f"labels must lie in [0, {num_classes - 1}]; "
            f"got values from {labels.min()} to {labels.max()}"
        )
    hot = np.zeros((labels.size, num_classes))
    hot[np.arange(labels.size), labels] = 1
    # Transpose so samples run along columns.
    return hot.T

def forward_propagate(W1, b1, W2, b2, X):
    """Run the two-layer network forward.

    Computes ``Z1 = W1 @ X + b1``, ``A1 = ReLU(Z1)``, ``Z2 = W2 @ A1 + b2`` and
    ``A2 = softmax(Z2)``.

    Returns
    -------
    tuple
        ``(Z1, A1, A2)``; the second pre-activation ``Z2`` is not returned.
    """
    hidden_pre = W1 @ X + b1
    hidden_act = ReLU(hidden_pre)
    output_pre = W2 @ hidden_act + b2
    output_prob = softmax(output_pre)
    return hidden_pre, hidden_act, output_prob

def backward_propagate(W2, Z1, A1, A2, X, Y):
    """Back-propagate through the two-layer network and return parameter gradients.

    ``Y`` holds integer class labels and is one-hot encoded here. The output
    error is ``A2 - one_hot(Y)``, which is the gradient form of a softmax output
    paired with cross-entropy loss (the loss itself is never computed in this
    function). Every gradient is scaled by ``1 / Y.size``.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``.
    """
    m = Y.size
    scale = 1 / m
    targets = one_hot(Y)

    output_error = A2 - targets
    # Push the error back through W2, then gate it with the ReLU derivative.
    hidden_error = (W2.T @ output_error) * dReLU(Z1)

    dW1 = (scale * hidden_error) @ X.T
    db1 = scale * np.sum(hidden_error, axis=1, keepdims=True)
    dW2 = (scale * output_error) @ A1.T
    db2 = scale * np.sum(output_error, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

def initialize_parameters(n_inputs=784, n_hidden=10, n_outputs=10, seed=None):
    """Create randomly initialised weights and biases for the two-layer network.

    Every entry is drawn uniformly from ``[-0.5, 0.5)``.

    Parameters
    ----------
    n_inputs : int, default 784
        Number of input features (columns of ``W1``).
    n_hidden : int, default 10
        Number of hidden units.
    n_outputs : int, default 10
        Number of output classes.
    seed : int, optional
        When given, values come from a private ``np.random.RandomState(seed)``
        so the result is reproducible and the global NumPy random state is left
        untouched. When omitted, the global ``np.random`` state is used, exactly
        as a plain ``np.random.rand`` call would.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` with shapes ``(n_hidden, n_inputs)``,
        ``(n_hidden, 1)``, ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
    """
    # Both the np.random module and a RandomState instance expose .rand().
    source = np.random if seed is None else np.random.RandomState(seed)
    W1 = source.rand(n_hidden, n_inputs) - 0.5
    b1 = source.rand(n_hidden, 1) - 0.5
    W2 = source.rand(n_outputs, n_hidden) - 0.5
    b2 = source.rand(n_outputs, 1) - 0.5
    return W1, b1, W2, b2

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, lr):
    """Apply one gradient-descent step: each parameter moves by ``-lr * gradient``.

    The augmented assignments below modify NumPy array arguments in place, so
    the caller's arrays are changed as well as being returned.

    Returns
    -------
    tuple
        ``(W1, b1, W2, b2)`` after the step.
    """
    W1 -= dW1 * lr
    b1 -= db1 * lr
    W2 -= dW2 * lr
    b2 -= db2 * lr
    updated = (W1, b1, W2, b2)
    return updated

In [269]:
def generate_data(train, test):
    """Split raw train/test tables into label vectors and scaled feature matrices.

    Both inputs are converted with ``np.array`` (which copies by default, so the
    caller's objects are not shuffled). The training rows are shuffled with the
    global NumPy random state; the test rows keep their order. Each table is
    then transposed so that samples run along columns: row 0 holds the labels
    and the remaining rows hold the features, which are divided by 255.0
    (presumably 8-bit pixel intensities being mapped into [0, 1]).

    Parameters
    ----------
    train, test : array-like
        Two-dimensional tables with one sample per row and the label in the
        first column.

    Returns
    -------
    tuple
        ``(train_labels, train_data, test_labels, test_data)``.
    """
    train, test = np.array(train), np.array(test)
    np.random.shuffle(train)  # shuffles along the first axis, i.e. whole rows
    train, test = train.T, test.T
    # `test` is already an ndarray here, so it is sliced directly, like `train`.
    train_labels, train_data = train[0], train[1:] / 255.0
    test_labels, test_data = test[0], test[1:] / 255.0
    return train_labels, train_data, test_labels, test_data

In [270]:
def get_predictions(A2):
    """Return, for every column of ``A2``, the row index holding the largest value."""
    predicted_classes = np.argmax(A2, axis=0)
    return predicted_classes

def get_accuracy(Y_prediction, Y_true):
    """Return the fraction of entries of ``Y_prediction`` that equal ``Y_true``."""
    matches = Y_prediction == Y_true
    n_correct = np.sum(matches)
    return n_correct / Y_true.size

def gradient_descent(X, Y, learning_rate, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix passed unchanged to ``forward_propagate``.
    Y : numpy.ndarray
        Integer class labels, one per column of ``X``.
    learning_rate : float
        Step size handed to ``update_parameters``.
    iterations : int
        Number of forward/backward/update passes. Zero is allowed and returns
        the freshly initialised parameters.

    Returns
    -------
    tuple
        The trained ``(W1, b1, W2, b2)``.

    Notes
    -----
    Every 100 iterations the accuracy is printed from that iteration's forward
    pass, i.e. for the parameters as they were before that iteration's update.
    """
    W1, b1, W2, b2 = initialize_parameters()
    for i in range(iterations):
        Z1, A1, A2 = forward_propagate(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = backward_propagate(W2, Z1, A1, A2, X, Y)
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, learning_rate)

        if i % 100 == 0:
            print(f"Iteration:\t{i}")
            predictions = get_predictions(A2)
            print(f"Accuracy:\t{get_accuracy(predictions, Y):.2%}\n")

    # Evaluate the parameters that are actually returned. The A2 left over from
    # the loop predates the last update, and does not exist at all when
    # iterations == 0.
    _, _, A2 = forward_propagate(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    print(f"Final Accuracy:\t{get_accuracy(predictions, Y):.2%}")

    return W1, b1, W2, b2


In [271]:
train = pd.read_csv("../data/mnist_train.csv")
test = pd.read_csv("../data/mnist_test.csv")

# Split each table into a label vector and a feature matrix scaled by 1/255,
# with samples along columns.
train_labels, train_data, test_labels, test_data = generate_data(train, test)

# Full-batch gradient descent; hyperparameters are passed by name so they are
# easy to find and tune.
W1, b1, W2, b2 = gradient_descent(train_data, train_labels, learning_rate=0.1, iterations=500)

Iteration:	0
Accuracy:	10.54%

Iteration:	100
Accuracy:	66.70%

Iteration:	200
Accuracy:	76.85%

Iteration:	300
Accuracy:	80.96%

Iteration:	400
Accuracy:	83.26%

Final Accuracy:	84.67%


In [272]:
def make_predictions(X, W1, b1, W2, b2):
    """Run the network forward on ``X`` and return the predicted class per column."""
    # forward_propagate returns (Z1, A1, A2); only the output activations matter here.
    output_probabilities = forward_propagate(W1, b1, W2, b2, X)[2]
    return get_predictions(output_probabilities)

In [273]:
# Evaluate the trained parameters on the test split, which was not passed to
# gradient_descent.
predictions = make_predictions(test_data, W1, b1, W2, b2)
accuracy = get_accuracy(predictions, test_labels)
print(f"Accuracy on test data:\t{accuracy:.2%}")

Accuracy on test data:	85.48%
