In [21]:
import numpy as np
import time
import random

## Network architecture (fixed by the dataset: flattened images in, class scores out)
NUM_INPUT = 784  # Number of input neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

## Hyperparameters (tunable)
NUM_HIDDEN = 100  # Width of the single hidden layer
LEARNING_RATE = 0.15  # SGD step size
BATCH_SIZE = 32  # Examples per mini-batch
NUM_EPOCH = 50  # Full passes over the training set

# Echo the hyperparameters so every run log records its own configuration.
for _label, _value in (
    ("NUM_HIDDEN: ", NUM_HIDDEN),
    ("LEARNING_RATE: ", LEARNING_RATE),
    ("BATCH_SIZE: ", BATCH_SIZE),
    ("NUM_EPOCH: ", NUM_EPOCH),
):
    print(_label, _value)
del _label, _value  # do not leave loop temporaries in the module namespace

# Given a vector w containing all the weights and bias vectors, extract
# and return the individual weights and biases W1, b1, W2, b2.
# The expected layout of w is [W1 | b1 | W2 | b2], i.e. the order pack() emits.
def unpack(w):
    # Offsets at which each component ends inside the flat vector.
    end_W1 = NUM_INPUT * NUM_HIDDEN
    end_b1 = end_W1 + NUM_HIDDEN
    end_W2 = end_b1 + NUM_HIDDEN * NUM_OUTPUT

    W1 = np.reshape(w[:end_W1], (NUM_INPUT, NUM_HIDDEN))
    b1 = np.reshape(w[end_W1:end_b1], NUM_HIDDEN)
    W2 = np.reshape(w[end_b1:end_W2], (NUM_HIDDEN, NUM_OUTPUT))
    # Whatever remains must be exactly b2; np.reshape raises if the size is off.
    b2 = np.reshape(w[end_W2:], NUM_OUTPUT)
    return W1, b1, W2, b2

# Inverse of unpack(): flatten W1 and W2 and join everything into one vector
# laid out as [W1 | b1 | W2 | b2].
def pack(W1, b1, W2, b2):
    # np.reshape (rather than a plain flatten) also checks that each matrix
    # has the element count implied by the network constants.
    flat_W1 = np.reshape(W1, NUM_INPUT * NUM_HIDDEN)
    flat_W2 = np.reshape(W2, NUM_HIDDEN * NUM_OUTPUT)
    return np.concatenate((flat_W1, b1, flat_W2, b2))

# Read one dataset split from ./data and return it as (images, labels).
# `which` is substituted into the file names, e.g. "train" or "test".
def loadData(which):
    image_path = "./data/mnist_{}_images.npy".format(which)
    label_path = "./data/mnist_{}_labels.npy".format(which)
    return np.load(image_path), np.load(label_path)

## 1. Forward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the cross-entropy (CE) loss averaged
# over the examples in X.
#
#   X: sequence/array of examples, one row per example (NUM_INPUT values each).
#   Y: sequence/array of label rows, one per example (NUM_OUTPUT values each);
#      presumably one-hot -- confirm against the data files.
#   w: flat parameter vector in the layout expected by unpack().
#
# Raises ValueError if X is empty or X and Y disagree on the example count.
def fCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Average over however many examples were actually passed in, instead of
    # assuming the batch always holds exactly BATCH_SIZE of them.
    num_examples = X.shape[0]
    if num_examples == 0:
        raise ValueError("fCE needs at least one example")
    if Y.shape[0] != num_examples:
        raise ValueError("X and Y must contain the same number of examples")

    # Hidden layer with ReLU, then the output logits, for the whole batch.
    Z1 = X @ W1 + b1
    H1 = np.maximum(Z1, 0)
    Z2 = H1 @ W2 + b2

    # Log-softmax with the row maximum subtracted first: mathematically the
    # same value, but np.exp can no longer overflow on large logits.
    Z2 = Z2 - np.max(Z2, axis=1, keepdims=True)
    log_softmax = Z2 - np.log(np.sum(np.exp(Z2), axis=1, keepdims=True))

    return -np.sum(Y * log_softmax) / num_examples

## 2. Backward Propagation
# Given training images X, associated labels Y, and a vector of combined weights
# and bias terms w, compute and return the gradient of fCE with respect to w.
#
#   X: sequence/array of examples, one row per example (NUM_INPUT values each).
#   Y: sequence/array of label rows, one per example (NUM_OUTPUT values each);
#      presumably one-hot -- confirm against the data files.
#   w: flat parameter vector in the layout expected by unpack().
#
# Returns a flat vector in the same layout as w (built with pack()).
# Raises ValueError if X is empty or X and Y disagree on the example count.
def gradCE (X, Y, w):
    W1, b1, W2, b2 = unpack(w)
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Use the real number of examples rather than assuming BATCH_SIZE.
    num_examples = X.shape[0]
    if num_examples == 0:
        raise ValueError("gradCE needs at least one example")
    if Y.shape[0] != num_examples:
        raise ValueError("X and Y must contain the same number of examples")

    # Forward pass for the whole batch.
    Z1 = X @ W1 + b1
    relu_mask = Z1 > 0
    H1 = Z1 * relu_mask
    Z2 = H1 @ W2 + b2

    # Softmax with the row maximum subtracted so np.exp cannot overflow.
    Z2 = Z2 - np.max(Z2, axis=1, keepdims=True)
    exp_Z2 = np.exp(Z2)
    Yhat = exp_Z2 / np.sum(exp_Z2, axis=1, keepdims=True)

    # Gradient of the mean loss w.r.t. the logits; dividing here once applies
    # the 1/num_examples averaging to every parameter gradient below.
    G2 = (Yhat - Y) / num_examples
    delta_W_2 = H1.T @ G2
    delta_b_2 = np.sum(G2, axis=0)

    # Back-propagate through W2 and the ReLU (gradient is zero where z1 <= 0).
    G1 = (G2 @ W2.T) * relu_mask
    delta_W_1 = X.T @ G1
    delta_b_1 = np.sum(G1, axis=0)

    return pack(delta_W_1, delta_b_1, delta_W_2, delta_b_2)

## 3. Parameter Update
# Helper: fraction of examples in (X, Y) whose highest-scoring output unit
# matches the position of the largest entry in the label row.
# Returns NaN for an empty dataset instead of dividing by zero.
def _accuracy(X, Y, w):
    X = np.asarray(X)
    Y = np.asarray(Y)
    num_examples = len(X)
    if num_examples == 0:
        return float("nan")

    W1, b1, W2, b2 = unpack(w)
    H1 = np.maximum(X @ W1 + b1, 0)
    Z2 = H1 @ W2 + b2
    # Softmax is monotonic, so the argmax of the raw logits is the prediction.
    hits = np.count_nonzero(np.argmax(Z2, axis=1) == np.argmax(Y, axis=1))
    return int(hits) / num_examples


# Given training and testing datasets and an initial set of weights/biases,
# train the NN with mini-batch SGD.
#
# w is updated IN PLACE, so the caller's array holds the trained parameters
# afterwards; nothing is returned. Each epoch prints the loss of the last
# mini-batch and the accuracy on the training set; the test-set accuracy is
# printed once after the final epoch.
#
# Only full mini-batches are used: when len(trainX) is not a multiple of
# BATCH_SIZE, the leftover examples of each epoch's shuffle are skipped.
def train(trainX, trainY, testX, testY, w):
    trainX = np.asarray(trainX)
    trainY = np.asarray(trainY)

    num_iter = len(trainX) // BATCH_SIZE
    indexes = list(range(len(trainX)))
    for epoch in range(NUM_EPOCH):
        print("\n", "epoch ", epoch)
        # random.shuffle reorders the list in place (and returns None).
        random.shuffle(indexes)
        for step in range(num_iter):
            batch = indexes[step * BATCH_SIZE : (step + 1) * BATCH_SIZE]
            X = trainX[batch]
            Y = trainY[batch]

            # In-place SGD step so the caller's w is updated too.
            w -= LEARNING_RATE * gradCE(X, Y, w)

            if step == num_iter - 1:
                # Loss of the final mini-batch, measured after its update.
                print("loss: ", fCE(X, Y, w))

        print("train accuracy: ", _accuracy(trainX, trainY, w))

    print(" ")
    print("finish training")
    print("now test")
    print("test accuracy: ", _accuracy(testX, testY, w))
    

if __name__ == "__main__":
    # Note the start time, then read both dataset splits from disk.
    start_time = time.time()
    (trainX, trainY), (testX, testY) = loadData("train"), loadData("test")

    print("len(trainX): ", len(trainX))
    print("len(testX): ", len(testX))

    # Random initial parameters: each weight matrix is drawn uniformly from
    # [-1/sqrt(fan_in), 1/sqrt(fan_in)); every bias starts at 0.01.
    W1 = (np.random.random(size=(NUM_INPUT, NUM_HIDDEN)) / NUM_INPUT**0.5) * 2 - 1. / NUM_INPUT**0.5
    b1 = np.ones(NUM_HIDDEN) * 0.01
    W2 = (np.random.random(size=(NUM_HIDDEN, NUM_OUTPUT)) / NUM_HIDDEN**0.5) * 2 - 1. / NUM_HIDDEN**0.5
    b2 = np.ones(NUM_OUTPUT) * 0.01

    # Flatten everything into the single vector the training code works on.
    w = pack(W1, b1, W2, b2)
    print("Shape of w:", w.shape)
    # Hard-coded reference figures (not computed by this run).
    print("highest train accuracy: 100%")
    print("highest test accuracy: 95.9%")

    # Run SGD; train() prints per-epoch progress and the final test accuracy.
    train(trainX, trainY, testX, testY, w)

NUM_HIDDEN:  100
LEARNING_RATE:  0.15
BATCH_SIZE:  32
NUM_EPOCH:  50
len(trainX):  10000
len(testX):  5000
Shape of w: (79510,)
highest train accuracy: 100%
highest test accuracy: 95.9%

 epoch  0
loss:  0.11420638112597586
train accuracy:  0.9168

 epoch  1
loss:  0.03655858177959964
train accuracy:  0.9404

 epoch  2
loss:  0.11252349263896538
train accuracy:  0.9498

 epoch  3
loss:  0.11612966891595142
train accuracy:  0.9584

 epoch  4
loss:  0.12396142209524798
train accuracy:  0.9715

 epoch  5
loss:  0.024494354729535317
train accuracy:  0.9775

 epoch  6
loss:  0.0344315300947135
train accuracy:  0.984

 epoch  7
loss:  0.024707654496906956
train accuracy:  0.9865

 epoch  8
loss:  0.019841110461019634
train accuracy:  0.9887

 epoch  9
loss:  0.017350980932143018
train accuracy:  0.9925

 epoch  10
loss:  0.00702359804142097
train accuracy:  0.9947

 epoch  11
loss:  0.01972395556708975
train accuracy:  0.9962

 epoch  12
loss:  0.012217662924596223
train accuracy:  0.9966

 