In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### Load train dataset

In [24]:
# Read the training CSV into a DataFrame; the path is relative to the notebook's working directory.
# NOTE(review): presumably one row per image with the label in the first column and the
# pixel values in the remaining columns (later cells slice it that way) — confirm.
data = pd.read_csv("../data/train.csv")

In [25]:
# Convert the DataFrame to a NumPy array. np.array makes a copy, so the in-place
# shuffle below operates on this array only.
data = np.array(data)
m, n = data.shape  # m = number of rows, n = number of columns
# Shuffle the rows in place before splitting into training and dev sets.
# A locally seeded Generator (fixed seed 42) makes the split reproducible across
# "Restart Kernel -> Run All" without touching NumPy's global random state.
np.random.default_rng(42).shuffle(data)

#### Dev set

In [26]:
# Hold out the first 1000 (already shuffled) rows as the dev set. Transposing puts
# one example per column, so row 0 holds the labels and rows 1..n-1 hold the features.
data_dev = data[:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n] / 255.  # scaling X between 0 and 1

#### Training set

In [27]:
# Every row from index 1000 onward forms the training set, laid out like the dev set:
# one example per column, labels in row 0, features in rows 1..n-1.
data_train = data[1000:].T
Y_train = data_train[0]
X_train = data_train[1:n] / 255.  # scaling X between 0 and 1
_, m_train = X_train.shape  # m_train = number of columns, i.e. training examples

In [35]:
# Display the training labels (row 0 of data_train) as a quick sanity check.
Y_train

array([0, 8, 7, ..., 4, 0, 5])

### Define our Neural Network

Our NN will have a simple two-layer architecture. The input layer $A^{[0]}$ will have 784 units, corresponding to the 784 pixels in each 28x28 input image. A hidden layer $A^{[1]}$ will have 10 units with ReLU activation, and finally the output layer $A^{[2]}$ will have 10 units, corresponding to the ten digit classes, with softmax activation.

**Forward propagation**

$Z^{[1]} = W^{[1]}X + b^{[1]}$

$A^{[1]} = g_{ReLU}(Z^{[1]})$

$Z^{[2]} = W^{[2]}A^{[1]} + b^{[2]}$

$A^{[2]} = g_{softmax}(Z^{[2]})$

**Backward propagation**

$dZ^{[2]} = A^{[2]} - Y$ (where $Y$ is the one-hot encoding of the labels, 10 x m)

$dW^{[2]} = \frac{1}{m} dZ^{[2]} A^{[1]T}$

$db^{[2]} = \frac{1}{m} \sum dZ^{[2]}$

$dZ^{[1]} = W^{[2]T} dZ^{[2]} \odot g^{[1]'}(Z^{[1]})$

$dW^{[1]} = \frac{1}{m} dZ^{[1]} A^{[0]T}$

$db^{[1]} = \frac{1}{m} \sum dZ^{[1]}$

**Parameter updates**

$W^{[2]} := W^{[2]} - \alpha dW^{[2]}$

$b^{[2]} := b^{[2]} - \alpha db^{[2]}$

$W^{[1]} := W^{[1]} - \alpha dW^{[1]}$

$b^{[1]} := b^{[1]} - \alpha db^{[1]}$

### Vars and shapes

**Forward prop**

- $A^{[0]} = X$: 784 x m

- $Z^{[1]} \sim A^{[1]}$: 10 x m

- $W^{[1]}$: 10 x 784 (as $W^{[1]}A^{[0]} \sim Z^{[1]}$)

- $b^{[1]}$: 10 x 1

- $Z^{[2]} \sim A^{[2]}$: 10 x m

- $W^{[2]}$: 10 x 10 (as $W^{[2]}A^{[1]} \sim Z^{[2]}$)

- $b^{[2]}$: 10 x 1

**Backprop**

- $dZ^{[2]}$: 10 x m (same shape as $A^{[2]}$)

- $dW^{[2]}$: 10 x 10

- $db^{[2]}$: 10 x 1

- $dZ^{[1]}$: 10 x m (same shape as $A^{[1]}$)

- $dW^{[1]}$: 10 x 784

- $db^{[1]}$: 10 x 1

In [38]:
def ini_params():
    """Create the initial weights and biases of the two-layer network.

    Every entry is `np.random.rand(...) - 0.5`, i.e. a uniform draw shifted to
    the interval [-0.5, 0.5), taken from NumPy's global random state.

    Returns:
        tuple: (W1, b1, W2, b2) with shapes (10, 784), (10, 1), (10, 10)
        and (10, 1) respectively.
    """
    param_shapes = [(10, 784), (10, 1), (10, 10), (10, 1)]
    # The generator is consumed left to right, so the draws happen in the
    # order W1, b1, W2, b2.
    W1, b1, W2, b2 = (np.random.rand(*shape) - 0.5 for shape in param_shapes)
    return W1, b1, W2, b2

In [39]:
def ReLU(Z):
    """Apply the rectified linear unit element-wise.

    Args:
        Z: array-like of pre-activations.

    Returns:
        Array of the same shape as Z in which every entry below zero is
        replaced by 0 and all other entries are unchanged.
    """
    floor = 0
    rectified = np.maximum(Z, floor)
    return rectified

In [40]:
def softmax(Z):
    """Numerically stable softmax along axis 0 (one distribution per column).

    Args:
        Z: array-like of logits. For a 2-D input each column is treated as one
           example; a 1-D input is treated as a single example.

    Returns:
        Array with the same shape as Z whose entries along axis 0 are
        non-negative and sum to 1.
    """
    Z = np.asarray(Z)
    if Z.size == 0:
        # Nothing to normalise; return an empty float array of the same shape.
        return np.exp(Z)
    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would turn
    # the division into inf / inf = NaN) when logits are large.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(shifted)
    # np.sum with an explicit axis replaces the built-in sum(), which iterates
    # over rows in Python and hides which axis is being reduced.
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

In [None]:
def forward_prop(W1, b1, W2, b2, X):
    """Run one forward pass through the two-layer network.

    Args:
        W1, b1: weights and bias of the hidden layer.
        W2, b2: weights and bias of the output layer.
        X: input features, one example per column.

    Returns:
        tuple: (Z1, A1, Z2, A2) — the hidden pre-activation, the hidden ReLU
        activation, the output pre-activation and the output softmax
        activation.
    """
    # Hidden layer: affine map followed by ReLU.
    hidden_pre = W1.dot(X) + b1
    hidden_act = ReLU(hidden_pre)
    # Output layer: affine map followed by softmax.
    output_pre = W2.dot(hidden_act) + b2
    output_act = softmax(output_pre)
    return hidden_pre, hidden_act, output_pre, output_act