In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

$$
\begin{align*}
x &= \text{input}\\
z_1 &= W_1 x + b_1\\
h &= \text{ReLU}(z_1)\\
z_2 &= W_2 h + b_2\\
\hat{y} &= \text{softmax}(z_2)\\
J &= \text{CE}(y, \hat{y})
\end{align*}
$$

In [3]:
def ReLU(x):
    """Rectified linear unit: elementwise maximum of 0 and ``x``.

    Accepts anything ``np.maximum`` accepts (scalar or array-like) and
    returns a result with the broadcast shape of the input.
    """
    floor = 0  # values below this threshold are clamped up to it
    return np.maximum(floor, x)

def dReLU(x):
    """Derivative of ReLU, evaluated elementwise.

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation (or ReLU output) values.

    Returns
    -------
    int or np.ndarray of int
        1 where ``x > 0`` and 0 elsewhere. A scalar input yields a plain
        Python ``int``; an array input yields an integer array of the same
        shape.
    """
    mask = np.asarray(x) > 0
    if mask.ndim == 0:
        # Keep the scalar contract: a single number in, a Python int out.
        return int(mask)
    # int(array) only works for one-element arrays, so build the 0/1 mask
    # with astype to support whole activation matrices.
    return mask.astype(int)

def softmax(x):
    """Numerically stable softmax over the class axis (axis 0).

    Parameters
    ----------
    x : scalar or array-like
        Logits. A 1-D vector is treated as a single sample; a 2-D array is
        treated as ``(classes, samples)`` and each column is normalised
        independently, matching the column-per-sample layout produced by
        ``W @ x + b`` in this notebook.

    Returns
    -------
    np.ndarray (or NumPy scalar for scalar input)
        Probabilities with the same shape as ``x``; along axis 0 they sum
        to 1.
    """
    x = np.asarray(x)
    # A 0-d input has no axis 0; reduce over everything in that case.
    axis = 0 if x.ndim > 0 else None
    # Subtracting the per-sample max leaves the result unchanged but keeps
    # np.exp from overflowing to inf (and the ratio from becoming NaN).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)

def one_hot(y, num_classes=None):
    """One-hot encode integer class labels as a ``(classes, samples)`` matrix.

    Parameters
    ----------
    y : array-like of int
        Class labels; flattened to 1-D before encoding.
    num_classes : int, optional
        Number of rows in the result. Defaults to ``y.max() + 1``, which
        under-counts whenever the largest class is absent from ``y`` (for
        example in a small mini-batch); pass it explicitly in that case.

    Returns
    -------
    np.ndarray
        Float array of shape ``(num_classes, y.size)`` with a single 1 per
        column.

    Raises
    ------
    ValueError
        If ``y`` is empty and ``num_classes`` is not given (no maximum to
        infer the class count from).
    """
    y = np.asarray(y).ravel()
    if num_classes is None:
        num_classes = y.max() + 1
    hot = np.zeros((y.size, num_classes))
    hot[np.arange(y.size), y] = 1
    # Transpose so each column is one sample, like the network's outputs.
    return hot.T

def forward_propagate(W1, b1, W2, b2, x):
    """Run the two-layer network forward and return the softmax output.

    Implements z1 = W1 x + b1, h = ReLU(z1), z2 = W2 h + b2 and
    y_hat = softmax(z2); only y_hat is returned.
    """
    hidden = ReLU(W1 @ x + b1)
    logits = W2 @ hidden + b2
    return softmax(logits)

def backward_propagate(W2, h, x, y_pred, y_true):
    """Gradients of the softmax cross-entropy loss for the two-layer network.

    All activations use the column-per-sample layout of the forward pass
    (``z = W @ x + b``).

    Parameters
    ----------
    W2 : np.ndarray, shape (classes, hidden)
        Output-layer weights.
    h : np.ndarray, shape (hidden, samples)
        Hidden activations ``ReLU(z1)``.
    x : np.ndarray, shape (inputs, samples)
        Network inputs.
    y_pred : np.ndarray, shape (classes, samples)
        Softmax outputs.
    y_true : array-like of int, length ``samples``
        Integer class labels.

    Returns
    -------
    dW1, db1, dW2, db2 : np.ndarray
        Gradients shaped like ``W1``, ``b1``, ``W2`` and ``b2``
        (``(hidden, inputs)``, ``(hidden, 1)``, ``(classes, hidden)``,
        ``(classes, 1)``), averaged over the batch. For a single sample
        the average equals the per-sample gradient.
    """
    y_pred = np.asarray(y_pred)
    n_classes, n_samples = y_pred.shape

    # One-hot targets are built against y_pred's row count so the shapes
    # agree even when the batch does not contain the highest class label.
    labels = np.asarray(y_true).ravel()
    targets = np.zeros((n_classes, n_samples))
    targets[labels, np.arange(n_samples)] = 1

    # dJ/dz2 for softmax followed by cross-entropy.
    delta_1 = y_pred - targets
    # Back through W2, then gate by the ReLU derivative. h = ReLU(z1), so
    # h > 0 exactly where z1 > 0.
    delta_2 = (W2.T @ delta_1) * (h > 0)

    dW1 = delta_2 @ x.T / n_samples
    db1 = delta_2.sum(axis=1, keepdims=True) / n_samples
    dW2 = delta_1 @ h.T / n_samples
    db2 = delta_1.sum(axis=1, keepdims=True) / n_samples
    return dW1, db1, dW2, db2

def initialize_parameters(n_in=784, n_hidden=10, n_out=10):
    """Draw initial weights and biases from a standard normal distribution.

    Parameters
    ----------
    n_in : int, default 784
        Number of input features.
    n_hidden : int, default 10
        Number of hidden units.
    n_out : int, default 10
        Number of output classes.

    Returns
    -------
    W1, b1, W2, b2 : np.ndarray
        Arrays of shape ``(n_hidden, n_in)``, ``(n_hidden, 1)``,
        ``(n_out, n_hidden)`` and ``(n_out, 1)``. Draws use NumPy's global
        random state, so call ``np.random.seed`` first for reproducibility.
    """
    # randn lives in the np.random submodule; there is no top-level np.randn.
    W1 = np.random.randn(n_hidden, n_in)
    b1 = np.random.randn(n_hidden, 1)
    W2 = np.random.randn(n_out, n_hidden)
    b2 = np.random.randn(n_out, 1)
    return W1, b1, W2, b2

def update_parameters(W1, b1, W2, b2, dW1, db1, dW2, db2, lr):
    """Apply one gradient-descent step and return the updated parameters.

    Each parameter is decremented by ``lr`` times its gradient using the
    augmented ``-=`` operator, so NumPy arrays passed in are modified in
    place and the same objects are returned as a 4-tuple
    ``(W1, b1, W2, b2)``.
    """
    params = [W1, b1, W2, b2]
    grads = (dW1, db1, dW2, db2)
    for idx, grad in enumerate(grads):
        params[idx] -= lr * grad
    return tuple(params)

In [6]:
# Read the train and test CSV files (in that order) into DataFrames.
train, test = map(
    pd.read_csv,
    ("../data/mnist_train.csv", "../data/mnist_test.csv"),
)
# Last expression of the cell: display the first rows of the test frame.
test.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
