## Writing a Python Program that Recognizes Images
### by Long Nguyen

We put together some of the functions implemented in the previous notebooks as well as finally implementing gradient descent. 

Feel free to copy and paste from your work on previous notebooks. 

In [8]:
from mnist_loader import load_data_wrapper
import numpy as np
import random
import matplotlib.pyplot as plt

In [9]:
import numpy as np

try:
    from jupyterthemes import jtplot
    jtplot.style()
except ImportError:
    pass

import matplotlib.pyplot as plt
%matplotlib notebook

In [10]:
from datasets import ToyData
from mygrad import Tensor
toy_data = ToyData()
from mygrad.nnet.layers import dense
from mygrad.nnet.activations import sigmoid

In [11]:
training_data, validation_data, test_data = load_data_wrapper()

In [12]:
def plot_images(images):
    "Plot a list of MNIST images."
    fig, axes = plt.subplots(nrows=1, ncols=len(images))
    for j, ax in enumerate(axes):
        ax.matshow(images[j][0].reshape(28,28), cmap = plt.cm.binary)
        ax.set_xticks([])
        ax.set_yticks([])
    plt.show()

#### Implement $\sigma(x)$. 

#### Implement the derivative of $\sigma$. (Hint: $\sigma'(x)=\sigma(x)(1-\sigma(x))$)

In [16]:
def sigmoid_prime(x):
    """Derivative of the sigmoid function."""
    return sigmoid(x)*(1-sigmoid(x))

#### Implement the score function.

In [13]:
def f(x, W1, W2, B1, B2):
    """Return the output of the network if ``x`` is input image and
    W1, W2, B1 and B2 are the learnable weights. """
    Z1 = dense(W1, x) + B1
    A1 = sigmoid(Z1)
    Z2 = dense(W2, A1) + B2
    A2 = sigmoid(Z2)
    return A2


In [18]:
def predict(images, W1, W2, B1, B2):
    predictions = []
    for im in images:
        a = f(img[0], W1, W2, B1, B2)
        predictions.append(np.argmax(a))
    return predictions

#### Implement vectorize_mini_batch.

In [14]:
def vectorize_mini_batch(mini_batch):
    """Given a minibatch of (image,lable) tuples of a certain size
    return the tuple X,Y where X contains all of the images and Y contains
    all of the labels stacked horizontally """
    mini_batch_x = []
    mini_batch_y = []
    for k in range(0,len(mini_batch)):
        mini_batch_x.append(mini_batch[k][0])
        mini_batch_y.append(mini_batch[k][1])
        
    X = Tensor(np.hstack(mini_batch_x))
    Y = Tensor(np.hstack(mini_batch_y))
    return X, Y   

Suppose we have an $L$-layer neural network. For an $m\times n$ matrix $A$, let i-th column of A be denoted by $A[i]$. 

Let $\cdot$ denote matrix multiplication and $\odot$ denote element-wise multiplication. 

These are the four equations of backpropagation. 

\begin{align}
\frac{\partial J}{\partial Z_L}&=\frac{1}{m}(A_L-Y)\odot\sigma'(Z_L)\\
\frac{\partial J}{\partial Z_i}&=\frac{1}{m}W_{i+1}^T\cdot \frac{\partial J}{\partial Z_{i+1}}\odot\sigma'(Z_i)\\
\frac{\partial J}{\partial W_i}
&=\frac{\partial J}{\partial Z_i}\cdot A_{i-1}^T\\
\frac{\partial J}{\partial B_i}
&=\frac{1}{m}\displaystyle\sum_i \frac{\partial J}{\partial Z_i}[i]
\end{align}

In [23]:
x = Tensor(np.array([[1,2],[3,4]]))
np.sum(np.array(x))

Tensor(10)

#### Implement gradient descent. 

In [34]:
def SGD(training_data, epochs, mini_batch_size, eta, test_data):
    """Gradient descent. 
    Epochs: the number of times the entire training_data is examined.
    mini_batch_size: the number of images used to approximate the gradient 
    each step of gradient descent.
    eta: the learning rate or the step size.
    test_data: check accuracy of the model against the test_data every epoch.
    """
    n = len(training_data)
    n_test = len(test_data)
    W1 = Tensor(np.random.randn(30, 784))
    W2 = Tensor(np.random.randn(10, 30))
    B1 = Tensor(np.random.randn(30, 1))
    B2 = Tensor(np.random.randn(10, 1))
    for j in range(epochs):
        random.shuffle(training_data)
        for k in range(0, n, mini_batch_size):
            mini_batch = training_data[k:k+mini_batch_size]
            X, Y = vectorize_mini_batch(mini_batch)
            #feed forward(vectorized)
            Z1 = dense(W1, X) + B1
            A1 = sigmoid(Z1) 
            Z2 = dense(W2, A1) + B2
            A2 = sigmoid(Z2)
                    
            loss = (1/(2*mini_batch_size)*(A2-Y)*(A2-Y)).sum()
            loss.backward()
            # update parameters by making a gradient step
            W2 = W2-eta*W2.grad
            W1 = W1-eta*W1.grad
            B2 = B2-eta*B2.grad
            B1 = B1-eta*B1.grad
            loss.null_gradients()
            
        # after every epoch, check the accuracy of the model    
        test_results = [(np.argmax(f(x, W1, W2, B1, B2)), y) for (x, y) in test_data]
        num_correct = sum(int(x == y) for (x, y) in test_results)
        print("Epoch {} : {} / {}".format(j, num_correct, n_test));
    return W1, B1, W2, B2


In [35]:
W1, B1, W2, B2 = SGD(training_data, 10, 10, 3, test_data)

RecursionError: maximum recursion depth exceeded