# Homework

Task
* Find the hyperparameters that yield get the best predictions on the test dataset

Team
* Yufei Chu
* Kavish Vijay Daftari

In [1]:
import numpy as np
from urllib import request
import gzip
import pickle

In [2]:
filename = [
    ["training_images","train-images-idx3-ubyte.gz"],
    ["test_images","t10k-images-idx3-ubyte.gz"],
    ["training_labels","train-labels-idx1-ubyte.gz"],
    ["test_labels","t10k-labels-idx1-ubyte.gz"]
]

def download_mnist():
    base_url = "http://yann.lecun.com/exdb/mnist/"
    for name in filename:
        print("Downloading "+name[1]+"...")
        request.urlretrieve(base_url+name[1], name[1])
    print("Download complete.")

def save_mnist():
    mnist = {}
    for name in filename[:2]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28*28)
    for name in filename[-2:]:
        with gzip.open(name[1], 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8)
    with open("mnist.pkl", 'wb') as f:
        pickle.dump(mnist,f)
    print("Save complete.")

In [3]:
# download_mnist()
# save_mnist()

In [4]:
def load():
    with open(r"mnist.pkl",'rb') as f:
        mnist = pickle.load(f)
    return mnist["training_images"], mnist["training_labels"], mnist["test_images"], mnist["test_labels"]

In [5]:
X_train, y_train, X_test, y_test = load()

In [6]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((60000, 784), (60000,), (10000, 784), (10000,))

In [7]:
X_train.size

47040000

to see if there are any *non-zero* elements in our matrices, use `np.any()`, which returns True if it finds at least one non-zero element:

In [8]:
np.any(X_train), np.any(X_test)

(True, True)

The sparsity of our dataset:

In [9]:
np.count_nonzero(X_train, axis=None) / X_train.size, np.count_nonzero(X_test, axis=None) / X_test.size

(0.19120229591836735, 0.1927575255102041)

In [10]:
np.any(y_train), np.any(y_test)

(True, True)

In [11]:
y_train

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

This is how to prepare out data so that it is *normalized* and *batched*:

In [12]:
def norm(X, x_min, x_max):
    nom = (X - X.min(axis=0)) * (x_max-x_min)
    denom = X.max(axis=0) - X.min(axis=0)
    denom[denom==0] = 1
    return x_min + nom/denom 

In [13]:
def prep_data(X, y):

    X_ = []
    y_ = []

    itr = int(len(y) / batch_size) + 1
    for j in range(1, itr):
        rng = j * batch_size
        X_.append(X[rng - batch_size : rng, :])
        y_.append(y[rng - batch_size : rng])

    X, y = np.array(X_), np.array(y_)
    X = norm(X, 0, 1)

    return X, y

## Tuning the hyperpatameter learning rates to 0.01

Our hyperparameters: `eta` and `alpha` are the multiplicative and additive **learning rates**, used to put the brakes on *giant* weight updates (baby steps!). `n_iter` is the number of training epochs, and `batch_size` is the size of our minibatch:

In [14]:
eta=0.01
alpha=0.01
n_iter=100
batch_size=50

In [15]:
eta, alpha, n_iter, batch_size

(0.01, 0.01, 100, 50)

Let's prepare our data:

In [16]:
X_train, y_train = prep_data(X_train, y_train)

In [17]:
X_train.shape, y_train.shape

((1200, 50, 784), (1200, 50))

In [18]:
1200 * 50

60000

In [19]:
np.count_nonzero(X_train, axis=None) / X_train.size

0.19120229591836735

Let's one-hot encode our labels so that instead of many categories denoted by integers one through 10, we only have two categories: `0` and `1`, albeit in more dimensions.

In [20]:
def one_hot_enc(y, num_labels=10):
    one_hot = np.zeros((num_labels, y.shape[0]), dtype=np.float32)

    for i, val in enumerate(y):
        one_hot[val,i] = 1.0

    return one_hot

In [21]:
def one_hot_enc_v2(y, num_labels=10):
    one_hot = np.zeros((y.shape[0], num_labels), dtype=np.float32)

    for i, val in enumerate(y):
        one_hot[i,val] = 1.0

    return one_hot

Let's initialize our weights:

In [22]:
def init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size):
    #w1 = np.random.randn(n_hidden_1, n_input + 1)
    #w2 = np.random.randn(n_hidden_2, n_hidden_1 + 1)
    #w3 = np.random.randn(n_output, n_hidden_2 + 1)
    
    w1 = np.random.randn(n_input, n_hidden_1)
    w2 = np.random.randn(n_hidden_1, n_hidden_2)
    w3 = np.random.randn(n_hidden_2, n_output)
    
    return w1, w2, w3

Binary cross-entropy loss:

In [23]:
def compute_loss(prediction, label):
    term_1 = -1*label * np.log(prediction)
    term_2 = (1 - label) * (np.log(1 - prediction))

    loss = np.sum(term_1 - term_2)
    return loss

Initialize the number of neurons per layer:

In [24]:
n_hidden_1, n_hidden_2, n_output = 100, 100, 10
n_input = len(X_train[0,0,:]) #returns the flattened image size (28*28 = 784)
n_input

784

Initialize the weight1, weight2, weight3

In [25]:
w1, w2, w3 = init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size)

Initialize Markovian weight history, losses and accuracy per epoch:

In [26]:
delta_w1_prev = np.zeros(w1.shape)
delta_w2_prev = np.zeros(w2.shape)
delta_w3_prev = np.zeros(w3.shape)

train_losses = []
train_acc = []

Here is the full code for the **feedforward pass** (we return input and all outputs):

In [27]:
def compute_forward_pass(inputs):   
    a1 = inputs

    # outputs of first hidden layer
    z2 = np.matmul(a1, w1)
    a2 = 1/(1 + np.exp(-z2))

    # outputs of second hidden layer
    z3 = np.matmul(a2, w2)
    a3 = 1/(1 + np.exp(-z3))

    # outputs of final layer
    z4 = np.matmul(a3, w3)
    a4 = 1/(1 + np.exp(-z4))
    
    return a1, z2, a2, z3, a3, z4, a4

In **backpropagation**, we evaluate the gradients of the loss with respect to the weights of each layer. 

The **gradient descent algorithm** consists in subtracting the respective gradient from each layer's weights (subject to an additive and multiplicative "*brake*" in order to take baby steps in that direction) so that we may reach the point where we minimize the difference between labels and predictions.

<br />
<center>
<img src =ipynb.images/duck-baby-steps.png width = 400 />
</center>

## 2 Hidden layers equations
Our equations for two hidden layers are ($\hat{y}$ is the output of the network while $y$ is the label, $z$ is the output of hidden layer 2, $r$ is the output of hidden layer 1, and $x$ is the input):

$$\frac{\partial \;\text{Loss}(y, \hat{y})}{\partial W^3} = 
2(y - \hat{y}) * \sigma'(W^3 z) \otimes z$$

$$\frac{\partial \;\text{Loss}(y, \hat{y})}{\partial W^2} = 
2(y - \hat{y}) * \sigma'(W^3 z)  \otimes W^3 *
\sigma'(W^2 r) \otimes r$$

$$\frac{\partial \;\text{Loss}(y, \hat{y})}{\partial W^1} =
2(y - \hat{y}) * \sigma'(W^3 z) \otimes W^3 
* \sigma'(W^2 r) \otimes W^2 * \sigma'(W^1 x) \otimes x$$

`a` for activation: `a1` is the input data, `a2` is the output of hidden layer 1, `a3` is the output of hidden layer 2, `a4` is the output of the final layer.

`z` for linear data: `z1` is the input data, `z2` is the linear output of hidden layer 1 *before its activation function*, `z3` is the linear output of hidden layer 2 *before its activation function*, `z4` is the linear output of the final layer *before its activation function*.

In analogy with our `ml_dense_pruned` notebook, $\hat{y}$ is the output of the network while $y$ is the label, $z$ is the output of hidden layer 2, $r$ is the output of hidden layer 1, and $x$ is the input. In code, $z = $ `self.layer2` is the output of hidden layer 2, and $r = $ `self.layer1` is the output of hidden layer 1. 

$\sigma'(W^3 z) = $ `sigmoid_derivative_3`.

$\sigma'(W^2 r) = $ `sigmoid_derivative_2`.

$\sigma'(W^1 z) = $ `sigmoid_derivative_1`.

Here's our activation and its derivative:

In [28]:
def sigmoid(x):
    try:
        x = np.vectorize(round)(x)
        return 1.0/(1+ np.exp(-x))
    except:
        print(x)
        print(np.exp(-x))
        return 1.0/(1+ np.exp(-x))

In [29]:
def sigmoid_derivative(x):
    return sigmoid(x) * (1.0 - sigmoid(x))

Here's the backpropagation pass:

In [30]:
def compute_backward_pass_fullmath(outputs, label, verbose=False):
    a1, z2, a2, z3, a3, z4, a4 = outputs

    #
    # LAST LAYER
    #
    
    # from ml-dense-pruned:
    # sigmoid_derivative_3 = sigmoid_derivative(np.dot(self.layer2, self.weights3)) # s'(W3 z) derivative of final layer output
    # d_weights3 = np.dot(self.layer2.T, 
    #                        (2*(self.y - self.output) * sigmoid_derivative_3))
    sigmoid_derivative_3 = sigmoid_derivative(np.dot(a3, w3)) # s'(W3 z) derivative of final layer output [50, 10]
    if verbose: print("sigmoid_derivative_3:", sigmoid_derivative_3.shape, "should be [50, 10]")
    d_weights3 = np.dot(a3.T, 
                           (2 * (label - a4) * sigmoid_derivative_3)) # [50, 100].T * [50, 10]
    if verbose: print("d_weights3:", d_weights3.shape, "should be [100, 10]")
    
    
    # 
    # 2ND HIDDEN LAYER
    #

    # from ml-dense-pruned:
    # sigmoid_derivative_2 = sigmoid_derivative(np.dot(self.layer1, self.weights2)) # s'(r W2) derivative of hidden layer 2 out
    # d_weights2 = np.dot(self.layer1.T,  
    #                        np.dot(2*(self.y - self.output) * sigmoid_derivative_3, self.weights3.T) * 
    #                        sigmoid_derivative_2)
    sigmoid_derivative_2 = sigmoid_derivative(np.dot(a2, w2)) # s'(r W2) derivative of hidden layer 2 out
    if verbose: print("sigmoid_derivative_2:", sigmoid_derivative_2.shape, "should be [50, 100]")
    d_weights2 = np.dot(a2.T,  
                        np.dot((2 * (label - a4) * sigmoid_derivative_3), w3.T) * 
                        sigmoid_derivative_2) # [50, 100].T * ([50, 10] * [100, 10].T)[50, 100] = [50, 100].T * [50, 100] 
                                                                                                # = [100, 100]
    if verbose: print("d_weights2:", d_weights2.shape, "should be [100, 100]")
    
    
    # 
    # 1ST HIDDEN LAYER
    #
    
    # from ml-dense-pruned:
    # sigmoid_derivative_1 = sigmoid_derivative(np.dot(self.input, self.weights1))  # s'(x W1) derivative of hidden layer 1 out
    # d_weights1 = np.dot(self.input.T,  
    #                        np.dot(
    #                            np.dot(2*(self.y - self.output) * sigmoid_derivative_3, self.weights3.T) *
    #                                sigmoid_derivative_2, self.weights2.T) *   
    #                        sigmoid_derivative_1)
    sigmoid_derivative_1 = sigmoid_derivative(np.dot(a1, w1))  # s'(x W1) derivative of hidden layer 1 out [50, 100]
    if verbose: print("sigmoid_derivative_1:", sigmoid_derivative_1.shape, "should be [50, 100]")
    d_weights1 = np.dot(a1.T,  
                           np.dot(
                               np.dot((2 * (label - a4) * sigmoid_derivative_3), w3.T) *
                                   sigmoid_derivative_2, w2.T) *   
                           sigmoid_derivative_1) # [50, 784].T * (([50, 100] * [100, 100].T) . [50, 100]) * [100, 100].T)
                                                # = [50, 784].T * [50, 100] * [100, 100].T
                                                # = [50, 784].T * [50, 100].T
                                                # = [784, 100]
    if verbose: print("d_weights1:", d_weights1.shape, "should be [784, 100]")

    return d_weights1, d_weights2, d_weights3

Here's how we predict from the output of 10 neurons, representing the digit predicted in a one-hot encoded fashion:

In [31]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

The training dataset:

In [32]:
X_train.shape, y_train.shape

((1200, 50, 784), (1200, 50))

In [33]:
X_train[:, 0, ]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
np.any(X_train)

True

In [35]:
y_train

array([[5, 0, 4, ..., 5, 9, 3],
       [3, 0, 7, ..., 8, 3, 1],
       [5, 7, 1, ..., 6, 7, 3],
       ...,
       [5, 7, 4, ..., 2, 0, 9],
       [4, 0, 1, ..., 5, 9, 8],
       [8, 4, 0, ..., 5, 6, 8]], dtype=uint8)

This is how to iterate over the training dataset:

Let's debug *one batch* of one epoch:

## Set the hyperparameter learning rate = 0.01, epoch = 100:

In [36]:
eta=0.01
alpha=0.01
batch_size=50
n_iter = 100

In [37]:
from tqdm import tqdm
#epoch loop
for i in tqdm(range(n_iter)):
    # batch loop
    for (input, label) in zip(X_train, y_train):
        one_hot_label = one_hot_enc_v2(label, num_labels=10)

        a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(input)
        loss = compute_loss(a4, one_hot_label)
        grad1, grad2, grad3 = compute_backward_pass_fullmath([a1, z2, a2, z3, a3, z4, a4], one_hot_label)

        # multiplicative learning factor
        delta_w1, delta_w2, delta_w3 = eta * grad1, eta * grad2, eta * grad3

        # additive learning factor
        w1 = w1 + delta_w1 + delta_w1_prev * alpha
        w2 = w2 + delta_w2 + delta_w2_prev * alpha
        w3 = w3 + delta_w3 + delta_w3_prev * alpha

        delta_w1_prev, delta_w2_prev, delta_w3_prev = delta_w1, delta_w2, delta_w3

        train_losses.append(loss)
        predictions = predict(a4)

        wrong = np.where(predictions != label, np.matrix([1.]), np.matrix([0.]))
        accuracy = 1 - (np.sum(wrong) / batch_size)
        train_acc.append(accuracy)

    # epoch loss and accuracy (mean of al batches)
    print('epoch ', i, 'loss %.2f' % np.mean(np.matrix(train_losses)).item(), 
          'training accuracy %.2f' % np.mean(np.matrix(train_acc)).item())

  1%|▍                                          | 1/100 [00:08<13:51,  8.40s/it]

epoch  0 loss 254.18 training accuracy 0.47


  2%|▊                                          | 2/100 [00:16<13:20,  8.17s/it]

epoch  1 loss 199.00 training accuracy 0.59


  3%|█▎                                         | 3/100 [00:23<12:25,  7.68s/it]

epoch  2 loss 170.31 training accuracy 0.65


  4%|█▋                                         | 4/100 [00:32<13:07,  8.20s/it]

epoch  3 loss 154.15 training accuracy 0.69


  5%|██▏                                        | 5/100 [00:40<12:43,  8.03s/it]

epoch  4 loss 143.52 training accuracy 0.72


  6%|██▌                                        | 6/100 [00:50<13:35,  8.68s/it]

epoch  5 loss 135.84 training accuracy 0.73


  7%|███                                        | 7/100 [00:57<12:44,  8.22s/it]

epoch  6 loss 129.95 training accuracy 0.75


  8%|███▍                                       | 8/100 [01:06<13:03,  8.52s/it]

epoch  7 loss 125.16 training accuracy 0.76


  9%|███▊                                       | 9/100 [01:18<14:27,  9.53s/it]

epoch  8 loss 120.98 training accuracy 0.77


 10%|████▏                                     | 10/100 [01:26<13:32,  9.03s/it]

epoch  9 loss 116.42 training accuracy 0.78


 11%|████▌                                     | 11/100 [01:37<14:27,  9.74s/it]

epoch  10 loss 108.23 training accuracy 0.79


 12%|█████                                     | 12/100 [01:45<13:28,  9.19s/it]

epoch  11 loss 101.12 training accuracy 0.80


 13%|█████▍                                    | 13/100 [01:56<14:00,  9.66s/it]

epoch  12 loss 95.01 training accuracy 0.81


 14%|█████▉                                    | 14/100 [02:05<13:30,  9.43s/it]

epoch  13 loss 89.71 training accuracy 0.82


 15%|██████▎                                   | 15/100 [02:13<12:43,  8.99s/it]

epoch  14 loss 85.06 training accuracy 0.83


 16%|██████▋                                   | 16/100 [02:24<13:29,  9.63s/it]

epoch  15 loss 80.95 training accuracy 0.84


 17%|███████▏                                  | 17/100 [02:31<12:18,  8.89s/it]

epoch  16 loss 77.29 training accuracy 0.85


 18%|███████▌                                  | 18/100 [02:39<11:44,  8.59s/it]

epoch  17 loss 74.00 training accuracy 0.85


 19%|███████▉                                  | 19/100 [02:48<11:52,  8.80s/it]

epoch  18 loss 71.03 training accuracy 0.86


 20%|████████▍                                 | 20/100 [02:57<11:50,  8.88s/it]

epoch  19 loss 68.34 training accuracy 0.86


 21%|████████▊                                 | 21/100 [03:04<10:54,  8.28s/it]

epoch  20 loss 65.88 training accuracy 0.87


 22%|█████████▏                                | 22/100 [03:12<10:34,  8.14s/it]

epoch  21 loss 63.64 training accuracy 0.87


 23%|█████████▋                                | 23/100 [03:19<09:57,  7.76s/it]

epoch  22 loss 61.57 training accuracy 0.87


 24%|██████████                                | 24/100 [03:29<10:39,  8.42s/it]

epoch  23 loss 59.66 training accuracy 0.88


 25%|██████████▌                               | 25/100 [03:38<10:51,  8.68s/it]

epoch  24 loss 57.89 training accuracy 0.88


 26%|██████████▉                               | 26/100 [03:53<12:53, 10.46s/it]

epoch  25 loss 56.24 training accuracy 0.88


 27%|███████████▎                              | 27/100 [04:01<11:49,  9.72s/it]

epoch  26 loss 54.71 training accuracy 0.89


 28%|███████████▊                              | 28/100 [04:08<10:43,  8.94s/it]

epoch  27 loss 53.27 training accuracy 0.89


 29%|████████████▏                             | 29/100 [04:16<10:14,  8.65s/it]

epoch  28 loss 51.93 training accuracy 0.89


 30%|████████████▌                             | 30/100 [04:23<09:33,  8.19s/it]

epoch  29 loss 50.67 training accuracy 0.90


 31%|█████████████                             | 31/100 [04:31<09:16,  8.07s/it]

epoch  30 loss 49.48 training accuracy 0.90


 32%|█████████████▍                            | 32/100 [04:38<08:46,  7.75s/it]

epoch  31 loss 48.37 training accuracy 0.90


 33%|█████████████▊                            | 33/100 [04:46<08:51,  7.93s/it]

epoch  32 loss 47.31 training accuracy 0.90


 34%|██████████████▎                           | 34/100 [04:53<08:27,  7.69s/it]

epoch  33 loss 46.31 training accuracy 0.90


 35%|██████████████▋                           | 35/100 [05:01<08:27,  7.81s/it]

epoch  34 loss 45.36 training accuracy 0.91


 36%|███████████████                           | 36/100 [05:08<08:01,  7.53s/it]

epoch  35 loss 44.46 training accuracy 0.91


 37%|███████████████▌                          | 37/100 [05:16<07:58,  7.59s/it]

epoch  36 loss 43.60 training accuracy 0.91


 38%|███████████████▉                          | 38/100 [05:23<07:43,  7.47s/it]

epoch  37 loss 42.79 training accuracy 0.91


 39%|████████████████▍                         | 39/100 [05:30<07:26,  7.32s/it]

epoch  38 loss 42.01 training accuracy 0.91


 40%|████████████████▊                         | 40/100 [05:45<09:45,  9.75s/it]

epoch  39 loss 41.27 training accuracy 0.91


 41%|█████████████████▏                        | 41/100 [05:53<09:03,  9.20s/it]

epoch  40 loss 40.56 training accuracy 0.92


 42%|█████████████████▋                        | 42/100 [06:01<08:27,  8.74s/it]

epoch  41 loss 39.88 training accuracy 0.92


 43%|██████████████████                        | 43/100 [06:09<07:59,  8.42s/it]

epoch  42 loss 39.22 training accuracy 0.92


 44%|██████████████████▍                       | 44/100 [06:16<07:32,  8.09s/it]

epoch  43 loss 38.60 training accuracy 0.92


 45%|██████████████████▉                       | 45/100 [06:24<07:26,  8.11s/it]

epoch  44 loss 38.00 training accuracy 0.92


 46%|███████████████████▎                      | 46/100 [06:31<07:06,  7.89s/it]

epoch  45 loss 37.43 training accuracy 0.92


 47%|███████████████████▋                      | 47/100 [06:44<08:05,  9.15s/it]

epoch  46 loss 36.87 training accuracy 0.92


 48%|████████████████████▏                     | 48/100 [06:53<07:59,  9.22s/it]

epoch  47 loss 36.34 training accuracy 0.92


 49%|████████████████████▌                     | 49/100 [07:01<07:39,  9.01s/it]

epoch  48 loss 35.82 training accuracy 0.93


 50%|█████████████████████                     | 50/100 [07:11<07:34,  9.09s/it]

epoch  49 loss 35.33 training accuracy 0.93


 51%|█████████████████████▍                    | 51/100 [07:18<07:04,  8.67s/it]

epoch  50 loss 34.85 training accuracy 0.93


 52%|█████████████████████▊                    | 52/100 [07:27<06:54,  8.64s/it]

epoch  51 loss 34.39 training accuracy 0.93


 53%|██████████████████████▎                   | 53/100 [07:35<06:35,  8.41s/it]

epoch  52 loss 33.95 training accuracy 0.93


 54%|██████████████████████▋                   | 54/100 [07:44<06:34,  8.58s/it]

epoch  53 loss 33.51 training accuracy 0.93


 55%|███████████████████████                   | 55/100 [07:52<06:19,  8.43s/it]

epoch  54 loss 33.10 training accuracy 0.93


 56%|███████████████████████▌                  | 56/100 [08:02<06:27,  8.80s/it]

epoch  55 loss 32.70 training accuracy 0.93


 57%|███████████████████████▉                  | 57/100 [08:11<06:29,  9.06s/it]

epoch  56 loss 32.30 training accuracy 0.93


 58%|████████████████████████▎                 | 58/100 [08:21<06:34,  9.40s/it]

epoch  57 loss 31.93 training accuracy 0.93


 59%|████████████████████████▊                 | 59/100 [08:31<06:22,  9.32s/it]

epoch  58 loss 31.56 training accuracy 0.93


 60%|█████████████████████████▏                | 60/100 [08:39<06:06,  9.16s/it]

epoch  59 loss 31.20 training accuracy 0.94


 61%|█████████████████████████▌                | 61/100 [08:49<06:02,  9.30s/it]

epoch  60 loss 30.86 training accuracy 0.94


 62%|██████████████████████████                | 62/100 [08:58<05:45,  9.09s/it]

epoch  61 loss 30.52 training accuracy 0.94


 63%|██████████████████████████▍               | 63/100 [09:07<05:41,  9.23s/it]

epoch  62 loss 30.20 training accuracy 0.94


 64%|██████████████████████████▉               | 64/100 [09:16<05:25,  9.04s/it]

epoch  63 loss 29.88 training accuracy 0.94


 65%|███████████████████████████▎              | 65/100 [09:23<05:01,  8.61s/it]

epoch  64 loss 29.57 training accuracy 0.94


 66%|███████████████████████████▋              | 66/100 [09:32<04:51,  8.57s/it]

epoch  65 loss 29.28 training accuracy 0.94


 67%|████████████████████████████▏             | 67/100 [09:39<04:32,  8.26s/it]

epoch  66 loss 28.98 training accuracy 0.94


 68%|████████████████████████████▌             | 68/100 [09:47<04:16,  8.00s/it]

epoch  67 loss 28.70 training accuracy 0.94


 69%|████████████████████████████▉             | 69/100 [09:55<04:10,  8.08s/it]

epoch  68 loss 28.42 training accuracy 0.94


 70%|█████████████████████████████▍            | 70/100 [10:03<03:57,  7.91s/it]

epoch  69 loss 28.16 training accuracy 0.94


 71%|█████████████████████████████▊            | 71/100 [10:11<03:56,  8.16s/it]

epoch  70 loss 27.89 training accuracy 0.94


 72%|██████████████████████████████▏           | 72/100 [10:20<03:50,  8.24s/it]

epoch  71 loss 27.64 training accuracy 0.94


 73%|██████████████████████████████▋           | 73/100 [10:28<03:39,  8.13s/it]

epoch  72 loss 27.39 training accuracy 0.94


 74%|███████████████████████████████           | 74/100 [10:38<03:49,  8.82s/it]

epoch  73 loss 27.15 training accuracy 0.94


 75%|███████████████████████████████▌          | 75/100 [10:45<03:27,  8.29s/it]

epoch  74 loss 26.91 training accuracy 0.94


 76%|███████████████████████████████▉          | 76/100 [10:53<03:14,  8.10s/it]

epoch  75 loss 26.68 training accuracy 0.94


 77%|████████████████████████████████▎         | 77/100 [11:00<03:01,  7.89s/it]

epoch  76 loss 26.45 training accuracy 0.95


 78%|████████████████████████████████▊         | 78/100 [11:08<02:52,  7.83s/it]

epoch  77 loss 26.23 training accuracy 0.95


 79%|█████████████████████████████████▏        | 79/100 [11:15<02:39,  7.59s/it]

epoch  78 loss 26.01 training accuracy 0.95


 80%|█████████████████████████████████▌        | 80/100 [11:22<02:27,  7.37s/it]

epoch  79 loss 25.80 training accuracy 0.95


 81%|██████████████████████████████████        | 81/100 [11:30<02:24,  7.59s/it]

epoch  80 loss 25.59 training accuracy 0.95


 82%|██████████████████████████████████▍       | 82/100 [11:37<02:13,  7.40s/it]

epoch  81 loss 25.39 training accuracy 0.95


 83%|██████████████████████████████████▊       | 83/100 [11:45<02:08,  7.57s/it]

epoch  82 loss 25.19 training accuracy 0.95


 84%|███████████████████████████████████▎      | 84/100 [11:52<01:59,  7.45s/it]

epoch  83 loss 25.00 training accuracy 0.95


 85%|███████████████████████████████████▋      | 85/100 [12:00<01:52,  7.51s/it]

epoch  84 loss 24.81 training accuracy 0.95


 86%|████████████████████████████████████      | 86/100 [12:07<01:43,  7.36s/it]

epoch  85 loss 24.63 training accuracy 0.95


 87%|████████████████████████████████████▌     | 87/100 [12:14<01:36,  7.41s/it]

epoch  86 loss 24.45 training accuracy 0.95


 88%|████████████████████████████████████▉     | 88/100 [12:21<01:27,  7.28s/it]

epoch  87 loss 24.27 training accuracy 0.95


 89%|█████████████████████████████████████▍    | 89/100 [12:31<01:29,  8.12s/it]

epoch  88 loss 24.09 training accuracy 0.95


 90%|█████████████████████████████████████▊    | 90/100 [12:40<01:23,  8.30s/it]

epoch  89 loss 23.92 training accuracy 0.95


 91%|██████████████████████████████████████▏   | 91/100 [12:50<01:18,  8.73s/it]

epoch  90 loss 23.76 training accuracy 0.95


 92%|██████████████████████████████████████▋   | 92/100 [12:58<01:09,  8.69s/it]

epoch  91 loss 23.59 training accuracy 0.95


 93%|███████████████████████████████████████   | 93/100 [13:07<01:01,  8.78s/it]

epoch  92 loss 23.43 training accuracy 0.95


 94%|███████████████████████████████████████▍  | 94/100 [13:17<00:53,  8.98s/it]

epoch  93 loss 23.27 training accuracy 0.95


 95%|███████████████████████████████████████▉  | 95/100 [13:24<00:42,  8.45s/it]

epoch  94 loss 23.12 training accuracy 0.95


 96%|████████████████████████████████████████▎ | 96/100 [13:32<00:33,  8.35s/it]

epoch  95 loss 22.97 training accuracy 0.95


 97%|████████████████████████████████████████▋ | 97/100 [13:45<00:29,  9.68s/it]

epoch  96 loss 22.82 training accuracy 0.95


 98%|█████████████████████████████████████████▏| 98/100 [13:55<00:19,  9.82s/it]

epoch  97 loss 22.67 training accuracy 0.95


 99%|█████████████████████████████████████████▌| 99/100 [14:02<00:08,  8.96s/it]

epoch  98 loss 22.53 training accuracy 0.95


100%|█████████████████████████████████████████| 100/100 [14:10<00:00,  8.50s/it]

epoch  99 loss 22.39 training accuracy 0.95





The loss keeps *decreasing*, so that is encouraging. We need to run more epochs! We can also play with the learning rates!

Verify with the test set:

In [38]:
y_test

array([7, 2, 1, ..., 4, 5, 6], dtype=uint8)

In [39]:
y_test_one_hot = np.zeros((y_test.shape[0], 10), dtype=np.float32)

for i, val in enumerate(y_test):
    y_test_one_hot[i, val] = 1.0
    
print(y_test_one_hot)

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [40]:
X_test.shape, y_test.shape, y_test_one_hot.shape

((10000, 784), (10000,), (10000, 10))

To predict the test dataset, we run one feedforward pass with a batch of size 10,000!

In [41]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test)
loss = compute_loss(a4, y_test_one_hot)
predictions = predict(a4)
predictions

  a2 = 1/(1 + np.exp(-z2))


array([7, 2, 1, ..., 4, 5, 6])

In [42]:
labels = predict(y_test_one_hot)
labels

array([7, 2, 1, ..., 4, 5, 6])

In [43]:
wrong = np.where(predictions != labels, np.matrix([1.]), np.matrix([0.]))
test_accuracy = 1 - (np.sum(wrong) / 10000)
test_accuracy

0.9424

Not too shappby for 50 epochs!

Example:

In [44]:
X_test[:2,:].shape

(2, 784)

In [45]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

In [46]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test[:2,:])
loss = compute_loss(a4, y_test_one_hot[:2, :])
predictions = predict(a4)
loss, predictions

  a2 = 1/(1 + np.exp(-z2))


(0.2542736681510867, array([7, 2]))

In [47]:
a4

array([[7.21184931e-06, 2.65877941e-08, 4.28408664e-03, 7.60822316e-06,
        2.39273193e-06, 3.43692148e-02, 1.06031509e-10, 9.99724484e-01,
        8.25118043e-08, 2.18907307e-07],
       [4.53987347e-05, 3.22716277e-02, 9.99353384e-01, 3.87548625e-08,
        3.02631715e-10, 9.37871792e-07, 1.65745227e-01, 3.33755479e-12,
        7.49285221e-08, 6.88091527e-10]])

In [48]:
np.argmax(a4, axis=1)

array([7, 2])

In [49]:
y_test_one_hot[:2, :]

array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [50]:
predict(y_test_one_hot[:2, :])

array([7, 2])

In [51]:
wrong = np.where(predict(a4) != predict(y_test_one_hot[:2, :]), np.matrix([1.]), np.matrix([0.]))
wrong

array([[0., 0.]])

In [52]:
np.sum(wrong)

0.0

In [53]:
accuracy = 1 - (np.sum(wrong) / 2)
accuracy

1.0

## Set the hyperparameter learning rate = 0.001, epoch = 100:

In [55]:
eta=0.001
alpha=0.001
batch_size=50
n_iter = 100

### Reinitialize the parameter

Initialize the number of neurons per layer:

In [56]:
n_hidden_1, n_hidden_2, n_output = 100, 100, 10
n_input = len(X_train[0,0,:]) #returns the flattened image size (28*28 = 784)
n_input

784

Initialize the weight1, weight2, weight3

In [57]:
w1, w2, w3 = init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size)

Initialize Markovian weight history, losses and accuracy per epoch:

In [58]:
delta_w1_prev = np.zeros(w1.shape)
delta_w2_prev = np.zeros(w2.shape)
delta_w3_prev = np.zeros(w3.shape)

train_losses = []
train_acc = []

In [59]:
from tqdm import tqdm
#epoch loop
for i in tqdm(range(n_iter)):
    # batch loop
    for (input, label) in zip(X_train, y_train):
        one_hot_label = one_hot_enc_v2(label, num_labels=10)

        a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(input)
        loss = compute_loss(a4, one_hot_label)
        grad1, grad2, grad3 = compute_backward_pass_fullmath([a1, z2, a2, z3, a3, z4, a4], one_hot_label)

        # multiplicative learning factor
        delta_w1, delta_w2, delta_w3 = eta * grad1, eta * grad2, eta * grad3

        # additive learning factor
        w1 = w1 + delta_w1 + delta_w1_prev * alpha
        w2 = w2 + delta_w2 + delta_w2_prev * alpha
        w3 = w3 + delta_w3 + delta_w3_prev * alpha

        delta_w1_prev, delta_w2_prev, delta_w3_prev = delta_w1, delta_w2, delta_w3

        train_losses.append(loss)
        predictions = predict(a4)

        wrong = np.where(predictions != label, np.matrix([1.]), np.matrix([0.]))
        accuracy = 1 - (np.sum(wrong) / batch_size)
        train_acc.append(accuracy)

    # epoch loss and accuracy (mean of al batches)
    print('epoch ', i, 'loss %.2f' % np.mean(np.matrix(train_losses)).item(), 
          'training accuracy %.2f' % np.mean(np.matrix(train_acc)).item())

  1%|▍                                          | 1/100 [00:10<17:19, 10.50s/it]

epoch  0 loss 374.93 training accuracy 0.17


  2%|▊                                          | 2/100 [00:18<14:50,  9.09s/it]

epoch  1 loss 340.18 training accuracy 0.23


  3%|█▎                                         | 3/100 [00:26<13:34,  8.40s/it]

epoch  2 loss 308.41 training accuracy 0.28


  4%|█▋                                         | 4/100 [00:33<13:04,  8.17s/it]

epoch  3 loss 281.04 training accuracy 0.34


  5%|██▏                                        | 5/100 [00:41<12:19,  7.79s/it]

epoch  4 loss 260.36 training accuracy 0.38


  6%|██▌                                        | 6/100 [00:48<12:12,  7.80s/it]

epoch  5 loss 243.97 training accuracy 0.41


  7%|███                                        | 7/100 [00:56<11:45,  7.59s/it]

epoch  6 loss 230.26 training accuracy 0.44


  8%|███▍                                       | 8/100 [01:03<11:21,  7.40s/it]

epoch  7 loss 218.21 training accuracy 0.47


  9%|███▊                                       | 9/100 [01:10<11:16,  7.43s/it]

epoch  8 loss 207.36 training accuracy 0.49


 10%|████▏                                     | 10/100 [01:17<11:02,  7.36s/it]

epoch  9 loss 197.57 training accuracy 0.51


 11%|████▌                                     | 11/100 [01:25<11:11,  7.54s/it]

epoch  10 loss 188.68 training accuracy 0.53


 12%|█████                                     | 12/100 [01:32<10:52,  7.41s/it]

epoch  11 loss 180.57 training accuracy 0.55


 13%|█████▍                                    | 13/100 [01:40<10:54,  7.53s/it]

epoch  12 loss 173.19 training accuracy 0.57


 14%|█████▉                                    | 14/100 [01:47<10:33,  7.36s/it]

epoch  13 loss 166.47 training accuracy 0.58


 15%|██████▎                                   | 15/100 [01:55<10:33,  7.46s/it]

epoch  14 loss 160.30 training accuracy 0.60


 16%|██████▋                                   | 16/100 [02:02<10:12,  7.29s/it]

epoch  15 loss 154.62 training accuracy 0.61


 17%|███████▏                                  | 17/100 [02:09<10:12,  7.38s/it]

epoch  16 loss 149.32 training accuracy 0.62


 18%|███████▌                                  | 18/100 [02:16<09:58,  7.30s/it]

epoch  17 loss 144.34 training accuracy 0.63


 19%|███████▉                                  | 19/100 [02:24<10:05,  7.48s/it]

epoch  18 loss 139.65 training accuracy 0.64


 20%|████████▍                                 | 20/100 [02:31<09:49,  7.37s/it]

epoch  19 loss 135.27 training accuracy 0.65


 21%|████████▊                                 | 21/100 [02:39<09:37,  7.31s/it]

epoch  20 loss 131.19 training accuracy 0.66


 22%|█████████▏                                | 22/100 [02:46<09:43,  7.48s/it]

epoch  21 loss 127.40 training accuracy 0.67


 23%|█████████▋                                | 23/100 [02:54<09:29,  7.40s/it]

epoch  22 loss 123.87 training accuracy 0.68


 24%|██████████                                | 24/100 [03:01<09:31,  7.51s/it]

epoch  23 loss 120.58 training accuracy 0.69


 25%|██████████▌                               | 25/100 [03:08<09:10,  7.34s/it]

epoch  24 loss 117.50 training accuracy 0.69


 26%|██████████▉                               | 26/100 [03:16<09:14,  7.50s/it]

epoch  25 loss 114.62 training accuracy 0.70


 27%|███████████▎                              | 27/100 [03:23<08:54,  7.32s/it]

epoch  26 loss 111.92 training accuracy 0.71


 28%|███████████▊                              | 28/100 [03:31<08:58,  7.48s/it]

epoch  27 loss 109.38 training accuracy 0.71


 29%|████████████▏                             | 29/100 [03:38<08:48,  7.44s/it]

epoch  28 loss 106.99 training accuracy 0.72


 30%|████████████▌                             | 30/100 [03:46<08:45,  7.50s/it]

epoch  29 loss 104.73 training accuracy 0.72


 31%|█████████████                             | 31/100 [03:53<08:27,  7.35s/it]

epoch  30 loss 102.60 training accuracy 0.73


 32%|█████████████▍                            | 32/100 [04:01<08:29,  7.49s/it]

epoch  31 loss 100.58 training accuracy 0.73


 33%|█████████████▊                            | 33/100 [04:08<08:08,  7.30s/it]

epoch  32 loss 98.66 training accuracy 0.74


 34%|██████████████▎                           | 34/100 [04:15<07:56,  7.22s/it]

epoch  33 loss 96.84 training accuracy 0.74


 35%|██████████████▋                           | 35/100 [04:22<07:59,  7.37s/it]

epoch  34 loss 95.12 training accuracy 0.75


 36%|███████████████                           | 36/100 [04:30<07:52,  7.38s/it]

epoch  35 loss 93.47 training accuracy 0.75


 37%|███████████████▌                          | 37/100 [04:38<08:05,  7.70s/it]

epoch  36 loss 91.90 training accuracy 0.76


 38%|███████████████▉                          | 38/100 [04:46<07:51,  7.60s/it]

epoch  37 loss 90.40 training accuracy 0.76


 39%|████████████████▍                         | 39/100 [04:54<07:57,  7.82s/it]

epoch  38 loss 88.97 training accuracy 0.76


 40%|████████████████▊                         | 40/100 [05:01<07:40,  7.68s/it]

epoch  39 loss 87.60 training accuracy 0.77


 41%|█████████████████▏                        | 41/100 [05:10<07:47,  7.92s/it]

epoch  40 loss 86.28 training accuracy 0.77


 42%|█████████████████▋                        | 42/100 [05:17<07:35,  7.85s/it]

epoch  41 loss 85.02 training accuracy 0.77


 43%|██████████████████                        | 43/100 [05:26<07:33,  7.96s/it]

epoch  42 loss 83.82 training accuracy 0.78


 44%|██████████████████▍                       | 44/100 [05:33<07:16,  7.80s/it]

epoch  43 loss 82.65 training accuracy 0.78


 45%|██████████████████▉                       | 45/100 [05:41<07:14,  7.89s/it]

epoch  44 loss 81.54 training accuracy 0.78


 46%|███████████████████▎                      | 46/100 [05:48<06:55,  7.69s/it]

epoch  45 loss 80.46 training accuracy 0.78


 47%|███████████████████▋                      | 47/100 [05:57<06:55,  7.83s/it]

epoch  46 loss 79.42 training accuracy 0.79


 48%|████████████████████▏                     | 48/100 [06:04<06:38,  7.66s/it]

epoch  47 loss 78.42 training accuracy 0.79


 49%|████████████████████▌                     | 49/100 [06:12<06:36,  7.77s/it]

epoch  48 loss 77.46 training accuracy 0.79


 50%|█████████████████████                     | 50/100 [06:19<06:19,  7.59s/it]

epoch  49 loss 76.53 training accuracy 0.79


 51%|█████████████████████▍                    | 51/100 [06:27<06:17,  7.71s/it]

epoch  50 loss 75.63 training accuracy 0.80


 52%|█████████████████████▊                    | 52/100 [06:34<06:03,  7.56s/it]

epoch  51 loss 74.76 training accuracy 0.80


 53%|██████████████████████▎                   | 53/100 [06:42<06:02,  7.70s/it]

epoch  52 loss 73.92 training accuracy 0.80


 54%|██████████████████████▋                   | 54/100 [06:50<05:47,  7.56s/it]

epoch  53 loss 73.10 training accuracy 0.80


 55%|███████████████████████                   | 55/100 [06:58<05:45,  7.68s/it]

epoch  54 loss 72.31 training accuracy 0.81


 56%|███████████████████████▌                  | 56/100 [07:05<05:31,  7.54s/it]

epoch  55 loss 71.54 training accuracy 0.81


 57%|███████████████████████▉                  | 57/100 [07:13<05:29,  7.67s/it]

epoch  56 loss 70.80 training accuracy 0.81


 58%|████████████████████████▎                 | 58/100 [07:20<05:19,  7.61s/it]

epoch  57 loss 70.08 training accuracy 0.81


 59%|████████████████████████▊                 | 59/100 [07:29<05:25,  7.94s/it]

epoch  58 loss 69.38 training accuracy 0.81


 60%|█████████████████████████▏                | 60/100 [07:37<05:17,  7.93s/it]

epoch  59 loss 68.70 training accuracy 0.81


 61%|█████████████████████████▌                | 61/100 [07:45<05:14,  8.07s/it]

epoch  60 loss 68.03 training accuracy 0.82


 62%|██████████████████████████                | 62/100 [07:53<04:58,  7.86s/it]

epoch  61 loss 67.39 training accuracy 0.82


 63%|██████████████████████████▍               | 63/100 [08:01<04:52,  7.91s/it]

epoch  62 loss 66.76 training accuracy 0.82


 64%|██████████████████████████▉               | 64/100 [08:08<04:35,  7.64s/it]

epoch  63 loss 66.15 training accuracy 0.82


 65%|███████████████████████████▎              | 65/100 [08:15<04:22,  7.51s/it]

epoch  64 loss 65.56 training accuracy 0.82


 66%|███████████████████████████▋              | 66/100 [08:23<04:18,  7.62s/it]

epoch  65 loss 64.98 training accuracy 0.82


 67%|████████████████████████████▏             | 67/100 [08:30<04:08,  7.52s/it]

epoch  66 loss 64.42 training accuracy 0.83


 68%|████████████████████████████▌             | 68/100 [08:39<04:17,  8.04s/it]

epoch  67 loss 63.87 training accuracy 0.83


 69%|████████████████████████████▉             | 69/100 [08:46<04:00,  7.77s/it]

epoch  68 loss 63.33 training accuracy 0.83


 70%|█████████████████████████████▍            | 70/100 [08:54<03:55,  7.85s/it]

epoch  69 loss 62.81 training accuracy 0.83


 71%|█████████████████████████████▊            | 71/100 [09:02<03:41,  7.65s/it]

epoch  70 loss 62.30 training accuracy 0.83


 72%|██████████████████████████████▏           | 72/100 [09:09<03:35,  7.70s/it]

epoch  71 loss 61.80 training accuracy 0.83


 73%|██████████████████████████████▋           | 73/100 [09:17<03:24,  7.58s/it]

epoch  72 loss 61.31 training accuracy 0.83


 74%|███████████████████████████████           | 74/100 [09:25<03:20,  7.71s/it]

epoch  73 loss 60.83 training accuracy 0.84


 75%|███████████████████████████████▌          | 75/100 [09:32<03:08,  7.53s/it]

epoch  74 loss 60.37 training accuracy 0.84


 76%|███████████████████████████████▉          | 76/100 [09:40<03:02,  7.61s/it]

epoch  75 loss 59.91 training accuracy 0.84


 77%|████████████████████████████████▎         | 77/100 [09:47<02:51,  7.47s/it]

epoch  76 loss 59.47 training accuracy 0.84


 78%|████████████████████████████████▊         | 78/100 [09:55<02:47,  7.63s/it]

epoch  77 loss 59.03 training accuracy 0.84


 79%|█████████████████████████████████▏        | 79/100 [10:02<02:40,  7.62s/it]

epoch  78 loss 58.61 training accuracy 0.84


 80%|█████████████████████████████████▌        | 80/100 [10:10<02:34,  7.74s/it]

epoch  79 loss 58.19 training accuracy 0.84


 81%|██████████████████████████████████        | 81/100 [10:18<02:24,  7.59s/it]

epoch  80 loss 57.78 training accuracy 0.84


 82%|██████████████████████████████████▍       | 82/100 [10:26<02:18,  7.70s/it]

epoch  81 loss 57.38 training accuracy 0.84


 83%|██████████████████████████████████▊       | 83/100 [10:33<02:08,  7.56s/it]

epoch  82 loss 56.99 training accuracy 0.85


 84%|███████████████████████████████████▎      | 84/100 [10:41<02:02,  7.67s/it]

epoch  83 loss 56.60 training accuracy 0.85


 85%|███████████████████████████████████▋      | 85/100 [10:48<01:52,  7.51s/it]

epoch  84 loss 56.23 training accuracy 0.85


 86%|████████████████████████████████████      | 86/100 [10:56<01:49,  7.80s/it]

epoch  85 loss 55.86 training accuracy 0.85


 87%|████████████████████████████████████▌     | 87/100 [11:04<01:39,  7.65s/it]

epoch  86 loss 55.49 training accuracy 0.85


 88%|████████████████████████████████████▉     | 88/100 [11:11<01:31,  7.64s/it]

epoch  87 loss 55.14 training accuracy 0.85


 89%|█████████████████████████████████████▍    | 89/100 [11:20<01:26,  7.88s/it]

epoch  88 loss 54.79 training accuracy 0.85


 90%|█████████████████████████████████████▊    | 90/100 [11:27<01:17,  7.71s/it]

epoch  89 loss 54.45 training accuracy 0.85


 91%|██████████████████████████████████████▏   | 91/100 [11:35<01:10,  7.83s/it]

epoch  90 loss 54.11 training accuracy 0.85


 92%|██████████████████████████████████████▋   | 92/100 [11:42<01:00,  7.61s/it]

epoch  91 loss 53.78 training accuracy 0.85


 93%|███████████████████████████████████████   | 93/100 [11:50<00:53,  7.71s/it]

epoch  92 loss 53.46 training accuracy 0.85


 94%|███████████████████████████████████████▍  | 94/100 [11:57<00:45,  7.56s/it]

epoch  93 loss 53.14 training accuracy 0.86


 95%|███████████████████████████████████████▉  | 95/100 [12:05<00:38,  7.69s/it]

epoch  94 loss 52.83 training accuracy 0.86


 96%|████████████████████████████████████████▎ | 96/100 [12:13<00:30,  7.56s/it]

epoch  95 loss 52.52 training accuracy 0.86


 97%|████████████████████████████████████████▋ | 97/100 [12:21<00:23,  7.70s/it]

epoch  96 loss 52.22 training accuracy 0.86


 98%|█████████████████████████████████████████▏| 98/100 [12:28<00:15,  7.59s/it]

epoch  97 loss 51.92 training accuracy 0.86


 99%|█████████████████████████████████████████▌| 99/100 [12:36<00:07,  7.62s/it]

epoch  98 loss 51.63 training accuracy 0.86


100%|█████████████████████████████████████████| 100/100 [12:43<00:00,  7.63s/it]

epoch  99 loss 51.34 training accuracy 0.86





In [60]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test)
loss = compute_loss(a4, y_test_one_hot)
predictions = predict(a4)
predictions

  a2 = 1/(1 + np.exp(-z2))


array([7, 2, 1, ..., 4, 5, 6])

In [61]:
labels = predict(y_test_one_hot)
labels

array([7, 2, 1, ..., 4, 5, 6])

In [62]:
wrong = np.where(predictions != labels, np.matrix([1.]), np.matrix([0.]))
test_accuracy = 1 - (np.sum(wrong) / 10000)
test_accuracy

0.9145

Not too shappby for 50 epochs!

Example:

In [63]:
X_test[:2,:].shape

(2, 784)

In [64]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

In [65]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test[:2,:])
loss = compute_loss(a4, y_test_one_hot[:2, :])
predictions = predict(a4)
loss, predictions

  a2 = 1/(1 + np.exp(-z2))


(0.4932320585247496, array([7, 2]))

In [66]:
a4

array([[5.04892679e-06, 1.20484504e-03, 7.35493303e-04, 2.95780685e-03,
        5.57131968e-07, 8.41581833e-03, 6.24249620e-06, 9.94835593e-01,
        2.75681249e-07, 2.16888253e-03],
       [3.42791970e-04, 6.88453783e-05, 6.55289350e-01, 1.64002806e-03,
        1.22315746e-05, 2.02573310e-03, 3.89327356e-02, 4.77283806e-06,
        5.80772658e-03, 2.04145067e-04]])

In [68]:
wrong = np.where(predict(a4) != predict(y_test_one_hot[:2, :]), np.matrix([1.]), np.matrix([0.]))
wrong

array([[0., 0.]])

In [69]:
np.sum(wrong)

0.0

In [70]:
accuracy = 1 - (np.sum(wrong) / 2)
accuracy

1.0

## Set the hyperparameter learning rate = 0.01, epoch = 50:

In [71]:
eta=0.01
alpha=0.01
batch_size=50
n_iter = 50

### Reinitialize the parameter

Initialize the number of neurons per layer:

In [72]:
n_hidden_1, n_hidden_2, n_output = 100, 100, 10
n_input = len(X_train[0,0,:]) #returns the flattened image size (28*28 = 784)
n_input

784

Initialize the weight1, weight2, weight3

In [73]:
w1, w2, w3 = init_weights(n_input, n_hidden_1, n_hidden_2, n_output, batch_size)

Initialize Markovian weight history, losses and accuracy per epoch:

In [74]:
delta_w1_prev = np.zeros(w1.shape)
delta_w2_prev = np.zeros(w2.shape)
delta_w3_prev = np.zeros(w3.shape)

train_losses = []
train_acc = []

In [75]:
from tqdm import tqdm
#epoch loop
for i in tqdm(range(n_iter)):
    # batch loop
    for (input, label) in zip(X_train, y_train):
        one_hot_label = one_hot_enc_v2(label, num_labels=10)

        a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(input)
        loss = compute_loss(a4, one_hot_label)
        grad1, grad2, grad3 = compute_backward_pass_fullmath([a1, z2, a2, z3, a3, z4, a4], one_hot_label)

        # multiplicative learning factor
        delta_w1, delta_w2, delta_w3 = eta * grad1, eta * grad2, eta * grad3

        # additive learning factor
        w1 = w1 + delta_w1 + delta_w1_prev * alpha
        w2 = w2 + delta_w2 + delta_w2_prev * alpha
        w3 = w3 + delta_w3 + delta_w3_prev * alpha

        delta_w1_prev, delta_w2_prev, delta_w3_prev = delta_w1, delta_w2, delta_w3

        train_losses.append(loss)
        predictions = predict(a4)

        wrong = np.where(predictions != label, np.matrix([1.]), np.matrix([0.]))
        accuracy = 1 - (np.sum(wrong) / batch_size)
        train_acc.append(accuracy)

    # epoch loss and accuracy (mean of al batches)
    print('epoch ', i, 'loss %.2f' % np.mean(np.matrix(train_losses)).item(), 
          'training accuracy %.2f' % np.mean(np.matrix(train_acc)).item())

  2%|▉                                           | 1/50 [00:09<07:37,  9.35s/it]

epoch  0 loss 288.46 training accuracy 0.43


  4%|█▊                                          | 2/50 [00:17<06:51,  8.58s/it]

epoch  1 loss 243.60 training accuracy 0.52


  6%|██▋                                         | 3/50 [00:25<06:39,  8.49s/it]

epoch  2 loss 209.08 training accuracy 0.58


  8%|███▌                                        | 4/50 [00:33<06:17,  8.20s/it]

epoch  3 loss 181.41 training accuracy 0.63


 10%|████▍                                       | 5/50 [00:41<06:12,  8.29s/it]

epoch  4 loss 163.63 training accuracy 0.67


 12%|█████▎                                      | 6/50 [00:49<05:59,  8.17s/it]

epoch  5 loss 151.21 training accuracy 0.69


 14%|██████▏                                     | 7/50 [00:58<05:54,  8.24s/it]

epoch  6 loss 142.02 training accuracy 0.71


 16%|███████                                     | 8/50 [01:06<05:46,  8.25s/it]

epoch  7 loss 134.91 training accuracy 0.73


 18%|███████▉                                    | 9/50 [01:15<05:42,  8.36s/it]

epoch  8 loss 129.26 training accuracy 0.74


 20%|████████▌                                  | 10/50 [01:23<05:28,  8.21s/it]

epoch  9 loss 124.62 training accuracy 0.75


 22%|█████████▍                                 | 11/50 [01:31<05:23,  8.29s/it]

epoch  10 loss 120.69 training accuracy 0.76


 24%|██████████▎                                | 12/50 [01:39<05:11,  8.21s/it]

epoch  11 loss 117.31 training accuracy 0.77


 26%|███████████▏                               | 13/50 [01:48<05:08,  8.34s/it]

epoch  12 loss 114.34 training accuracy 0.77


 28%|████████████                               | 14/50 [01:55<04:53,  8.16s/it]

epoch  13 loss 111.69 training accuracy 0.78


 30%|████████████▉                              | 15/50 [02:04<04:47,  8.22s/it]

epoch  14 loss 109.27 training accuracy 0.79


 32%|█████████████▊                             | 16/50 [02:11<04:33,  8.05s/it]

epoch  15 loss 107.03 training accuracy 0.79


 34%|██████████████▌                            | 17/50 [02:20<04:28,  8.14s/it]

epoch  16 loss 104.91 training accuracy 0.79


 36%|███████████████▍                           | 18/50 [02:27<04:14,  7.96s/it]

epoch  17 loss 102.87 training accuracy 0.80


 38%|████████████████▎                          | 19/50 [02:36<04:10,  8.08s/it]

epoch  18 loss 100.81 training accuracy 0.80


 40%|█████████████████▏                         | 20/50 [02:43<03:58,  7.94s/it]

epoch  19 loss 98.28 training accuracy 0.81


 42%|██████████████████                         | 21/50 [02:52<03:54,  8.08s/it]

epoch  20 loss 94.62 training accuracy 0.81


 44%|██████████████████▉                        | 22/50 [02:59<03:43,  7.98s/it]

epoch  21 loss 91.21 training accuracy 0.82


 46%|███████████████████▊                       | 23/50 [03:08<03:38,  8.11s/it]

epoch  22 loss 88.07 training accuracy 0.82


 48%|████████████████████▋                      | 24/50 [03:15<03:26,  7.95s/it]

epoch  23 loss 85.16 training accuracy 0.83


 50%|█████████████████████▌                     | 25/50 [03:24<03:21,  8.04s/it]

epoch  24 loss 82.47 training accuracy 0.83


 52%|██████████████████████▎                    | 26/50 [03:31<03:09,  7.88s/it]

epoch  25 loss 79.97 training accuracy 0.84


 54%|███████████████████████▏                   | 27/50 [03:40<03:06,  8.10s/it]

epoch  26 loss 77.65 training accuracy 0.84


 56%|████████████████████████                   | 28/50 [03:47<02:55,  7.98s/it]

epoch  27 loss 75.47 training accuracy 0.85


 58%|████████████████████████▉                  | 29/50 [03:56<02:49,  8.08s/it]

epoch  28 loss 73.44 training accuracy 0.85


 60%|█████████████████████████▊                 | 30/50 [04:03<02:38,  7.91s/it]

epoch  29 loss 71.53 training accuracy 0.86


 62%|██████████████████████████▋                | 31/50 [04:12<02:32,  8.04s/it]

epoch  30 loss 69.74 training accuracy 0.86


 64%|███████████████████████████▌               | 32/50 [04:19<02:22,  7.92s/it]

epoch  31 loss 68.05 training accuracy 0.86


 66%|████████████████████████████▍              | 33/50 [04:28<02:16,  8.01s/it]

epoch  32 loss 66.46 training accuracy 0.87


 68%|█████████████████████████████▏             | 34/50 [04:35<02:06,  7.93s/it]

epoch  33 loss 64.95 training accuracy 0.87


 70%|██████████████████████████████             | 35/50 [04:43<02:00,  8.02s/it]

epoch  34 loss 63.53 training accuracy 0.87


 72%|██████████████████████████████▉            | 36/50 [04:52<01:53,  8.07s/it]

epoch  35 loss 62.18 training accuracy 0.87


 74%|███████████████████████████████▊           | 37/50 [05:00<01:46,  8.17s/it]

epoch  36 loss 60.89 training accuracy 0.88


 76%|████████████████████████████████▋          | 38/50 [05:12<01:50,  9.22s/it]

epoch  37 loss 59.67 training accuracy 0.88


 78%|█████████████████████████████████▌         | 39/50 [05:20<01:37,  8.83s/it]

epoch  38 loss 58.51 training accuracy 0.88


 80%|██████████████████████████████████▍        | 40/50 [05:28<01:27,  8.72s/it]

epoch  39 loss 57.40 training accuracy 0.88


 82%|███████████████████████████████████▎       | 41/50 [05:36<01:16,  8.48s/it]

epoch  40 loss 56.35 training accuracy 0.89


 84%|████████████████████████████████████       | 42/50 [05:45<01:08,  8.53s/it]

epoch  41 loss 55.34 training accuracy 0.89


 86%|████████████████████████████████████▉      | 43/50 [05:53<01:00,  8.59s/it]

epoch  42 loss 54.37 training accuracy 0.89


 88%|█████████████████████████████████████▊     | 44/50 [06:02<00:51,  8.55s/it]

epoch  43 loss 53.44 training accuracy 0.89


 90%|██████████████████████████████████████▋    | 45/50 [06:10<00:42,  8.46s/it]

epoch  44 loss 52.55 training accuracy 0.89


 92%|███████████████████████████████████████▌   | 46/50 [06:19<00:34,  8.59s/it]

epoch  45 loss 51.70 training accuracy 0.89


 94%|████████████████████████████████████████▍  | 47/50 [06:27<00:25,  8.39s/it]

epoch  46 loss 50.88 training accuracy 0.90


 96%|█████████████████████████████████████████▎ | 48/50 [06:36<00:17,  8.52s/it]

epoch  47 loss 50.10 training accuracy 0.90


 98%|██████████████████████████████████████████▏| 49/50 [06:45<00:08,  8.71s/it]

epoch  48 loss 49.34 training accuracy 0.90


100%|███████████████████████████████████████████| 50/50 [06:54<00:00,  8.29s/it]

epoch  49 loss 48.61 training accuracy 0.90





In [76]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test)
loss = compute_loss(a4, y_test_one_hot)
predictions = predict(a4)
predictions

  a2 = 1/(1 + np.exp(-z2))


array([7, 2, 1, ..., 4, 5, 6])

In [77]:
labels = predict(y_test_one_hot)
labels

array([7, 2, 1, ..., 4, 5, 6])

In [78]:
wrong = np.where(predictions != labels, np.matrix([1.]), np.matrix([0.]))
test_accuracy = 1 - (np.sum(wrong) / 10000)
test_accuracy

0.9403

Not too shappby for 50 epochs!

Example:

In [79]:
X_test[:2,:].shape

(2, 784)

In [80]:
def predict(a4):
    prediction = np.argmax(a4, axis=1)
    return prediction

In [81]:
a1, z2, a2, z3, a3, z4, a4 = compute_forward_pass(X_test[:2,:])
loss = compute_loss(a4, y_test_one_hot[:2, :])
predictions = predict(a4)
loss, predictions

  a2 = 1/(1 + np.exp(-z2))


(0.029139609454097014, array([7, 2]))

In [82]:
a4

array([[8.26013514e-06, 6.78718437e-06, 1.76458691e-04, 1.89373528e-04,
        1.83611373e-05, 4.52088906e-07, 2.36175365e-09, 9.99233208e-01,
        4.51706784e-05, 3.94189537e-04],
       [9.77413075e-03, 2.46625585e-03, 9.87265544e-01, 1.30686239e-04,
        3.16643906e-08, 5.65229540e-04, 1.04856925e-03, 1.10475893e-08,
        6.02294275e-05, 6.19937518e-04]])

In [83]:
wrong = np.where(predict(a4) != predict(y_test_one_hot[:2, :]), np.matrix([1.]), np.matrix([0.]))
wrong

array([[0., 0.]])

In [84]:
np.sum(wrong)

0.0

In [85]:
accuracy = 1 - (np.sum(wrong) / 2)
accuracy

1.0

## Conclusion

* For each result, we can see the hyperparameter with learning rate = 0.01, batch_size=50, epoch = 100 has the highest accuracy, the accuracy = 95% and the test accuracy = 100%