In [2]:
import numpy as np
import scipy.optimize
from scipy.special import softmax

In [58]:
NUM_INPUT = 784  # Number of input neurons
NUM_HIDDEN = 50  # Number of hidden neurons
NUM_OUTPUT = 10  # Number of output neurons
NUM_CHECK = 5  # Number of examples on which to check the gradient

def unpack (w):  
    W1 = np.ndarray(shape=(NUM_HIDDEN,NUM_INPUT))
    b1 = np.ndarray(shape=(NUM_HIDDEN,1))
    W2 = np.ndarray(shape=(NUM_OUTPUT,NUM_HIDDEN))
    b2 = np.ndarray(shape=(NUM_OUTPUT,1))
    
    [w1, b1_f, w2, b2_f] = np.split(w, [W1.size, W1.size + b1.size, W1.size + b1.size + W2.size])
    
    W1_list = np.split(w1,NUM_HIDDEN)  
    for i in range(NUM_HIDDEN):
        W1[i] = W1_list[i]
        
    W2_list = np.split(w2,NUM_OUTPUT)
    for i in range(NUM_OUTPUT):
        W2[i] = W2_list[i]
    
    b1 = b1_f.reshape(len(b1_f),1)
    b2 = b2_f.reshape(len(b2_f),1)
    
    return W1, b1, W2, b2

def pack (W1, b1, W2, b2):
    w = W1[:,0]
    for i in range(W1.shape[1]-1):
        w = np.concatenate([w, W1[:,i+1]], axis = None)
    w = np.concatenate([w, b1], axis = None)
    for i in range(W2.shape[1]):
        w = np.concatenate([w, W2[:,i]], axis = None)
    w = np.concatenate([w, b2], axis = None)
    
    return w

def loadData (which):
    images = np.load("data/mnist_{}_images.npy".format(which))
    labels = np.load("data/mnist_{}_labels.npy".format(which))
    return images, labels

def fCE (X, Y, w, batch_size):
    W1, b1, W2, b2 = unpack(w)
    cost = 0.0
    z1_s = np.zeros(shape=(NUM_HIDDEN,batch_size))
    y_pred_s = np.zeros(shape=(NUM_OUTPUT,batch_size))
    
    for m in range(batch_size):
        z1 = np.dot(W1,X[m].reshape(len(X[m]),1)) + b1 
        z1_s[:,m] = z1[:,0]    
        h1 = np.maximum(0,z1) 
        z2 = np.dot(W2,h1) + b2 
        y_pred = softmax(z2)
        y_pred_s[:,m] = y_pred[:,0]
        
        cost -= np.dot(Y[m], np.log(y_pred))
    
    cost /= batch_size
        
    return z1_s, y_pred_s, cost

def gradCE (X, Y, w, batch_size):
    W1, b1, W2, b2 = unpack(w)
    z1_s, y_pred_s, __ = fCE(X, Y, w, batch_size)
     
    delta_z2_s = np.zeros(shape=(NUM_OUTPUT,batch_size))
    delta_z1_s = np.zeros(shape=(NUM_HIDDEN,batch_size))
    
    grad_W2 = np.zeros(shape=(NUM_OUTPUT,NUM_HIDDEN))
    grad_b2 = np.zeros(shape=(NUM_OUTPUT,1))
    
    grad_W1 = np.zeros(shape=(NUM_HIDDEN,NUM_INPUT))
    grad_b1 = np.zeros(shape=(NUM_HIDDEN,1))
    
    step = lambda x: 1.0 if x > 1.0e-32 else 0.0
    
    for m in range(batch_size):
        for k in range(NUM_OUTPUT):
            delta_z2_s[k][m] = y_pred_s[k][m] - Y[m][k]
    
    for m in range(batch_size):    
        for k in range(NUM_HIDDEN):
            delta_z1_s[k][m] = step(z1_s[k][m]) * np.dot(delta_z2_s[:,m], W2[:,k])
    
    for i in range(NUM_OUTPUT):
        for j in range(NUM_HIDDEN):
            for m in range(batch_size):      
                grad_W2[i][j] += delta_z2_s[i][m] * (np.maximum(0,z1_s[:,m]))[j]
                grad_b2[i] += delta_z2_s[i][m]
            grad_W2[i][j] /= batch_size
            grad_b2[i] /= batch_size
    
    for i in range(NUM_HIDDEN):
        for j in range(NUM_INPUT):
            for m in range(batch_size):
                grad_W1[i][j] += delta_z1_s[i][m] * X[m][j]
                grad_b1[i] += delta_z1_s[i][m]
            grad_W1[i][j] /= batch_size
            grad_b1[i] /= batch_size
    
    grad = pack(grad_W1.T, grad_b1, grad_W2.T, grad_b2)
  
    return grad

if __name__ == "__main__":
    # Load data
    trainX, trainY = loadData("train")
    testX, testY = loadData("test")

    print("len(trainX): ", len(trainX))
    print("len(testX): ", len(testX))

    # Initialize weights randomly
    W1 = 2*(np.random.random(size=(NUM_INPUT, NUM_HIDDEN))/NUM_INPUT**0.5) - 1./NUM_INPUT**0.5
    b1 = 0.01 * np.ones(NUM_HIDDEN)
    W2 = 2*(np.random.random(size=(NUM_HIDDEN, NUM_OUTPUT))/NUM_HIDDEN**0.5) - 1./NUM_HIDDEN**0.5
    b2 = 0.01 * np.ones(NUM_OUTPUT)
    w = pack(W1, b1.reshape(len(b1),1), W2, b2.reshape(len(b2),1))
    
    # Check that the gradient is correct on just a few examples (randomly drawn).
    idxs = np.random.permutation(trainX.shape[0])[0:NUM_CHECK]
    discrepancy = scipy.optimize.check_grad(lambda w_: fCE(np.atleast_2d(trainX[idxs,:]), np.atleast_2d(trainY[idxs,:]), w_, NUM_CHECK)[2], lambda w_: gradCE(np.atleast_2d(trainX[idxs,:]), np.atleast_2d(trainY[idxs,:]), w_, NUM_CHECK), w)
    if discrepancy < 0.01:
        print("My implemented cost and gradient functions are correct")

#     train_acc, test_acc = train(trainX, trainY, testX, testY, w)
#     print(train_acc,test_acc)

len(trainX):  10000
len(testX):  5000


In [59]:
discrepancy

0.11792891343254716

In [None]:
trainY[1]