In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import math
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


# Load the data
def loadData():
    with np.load("notMNIST.npz") as data:
        Data, Target = data["images"], data["labels"]
        np.random.seed(521)
        randIndx = np.arange(len(Data))
        np.random.shuffle(randIndx)
        Data = Data[randIndx] / 255.0
        Target = Target[randIndx]
        trainData, trainTarget = Data[:10000], Target[:10000]
        validData, validTarget = Data[10000:16000], Target[10000:16000]
        testData, testTarget = Data[16000:], Target[16000:]
    return trainData, validData, testData, trainTarget, validTarget, testTarget

# Implementation of a neural network using only Numpy - trained using gradient descent with momentum


def convertOneHot(trainTarget, validTarget, testTarget):
    newtrain = np.zeros((trainTarget.shape[0], 10))
    newvalid = np.zeros((validTarget.shape[0], 10))
    newtest = np.zeros((testTarget.shape[0], 10))

    for item in range(0, trainTarget.shape[0]):
        newtrain[item][trainTarget[item]] = 1
    for item in range(0, validTarget.shape[0]):
        newvalid[item][validTarget[item]] = 1
    for item in range(0, testTarget.shape[0]):
        newtest[item][testTarget[item]] = 1
    return newtrain, newvalid, newtest


def shuffle(trainData, trainTarget):
    np.random.seed(421)
    randIndx = np.arange(len(trainData))
    target = trainTarget
    np.random.shuffle(randIndx)
    data, target = trainData[randIndx], target[randIndx]
    return data, target


def relu(x):
    # TODO
    return np.maximum(x, 0)


def softmax(x):
    # TODO
    return np.exp(x)/np.sum(np.exp(x))


def computeLayer(X, W, b):
    # TODO
    return np.matmul(X, W)+b


def CE(target, prediction):
    # TODO
    score = softmax(prediction)
    ce = np.sum(np.multiply(target, np.log(score)), axis=1)
    loss = -np.mean(ce)
    return loss


def gradCE(target, prediction):
    # TODO
    N = target.shape[0]
    score = softmax(prediction)
    print ("score.shape: ", score.shape)
    print ("target.shape: ",target.shape)
    res = score - prediction
    return res


def error(target, prediction):
    return np.multiply((target-prediction),(target-prediction))


def derivation_LW(last_X, y, target):
    grad_CE = gradCE(target, y)
    res = 2*np.multiply((np.multiply((y-target), grad_CE)), last_X)
    return res

  from ._conv import register_converters as _register_converters


In [2]:
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
    train_y, valid_y, test_y = convertOneHot(trainTarget, validTarget, testTarget)


In [3]:
    s, l, h = trainData.shape #1000 samples, 28, 28
    F = l*h  #784 features
    c = 10  #10 classes
    xi = trainData.reshape(s, F)
    variance_h = 2/(F + s)
    variance_o = 2/(s + c)

    mean, stand_dev_h, stand_dev_o = 0, math.sqrt(variance_h), math.sqrt(variance_o), 

    Wh = np.random.normal(mean, stand_dev_h, (F, s)) #784,1000
    bh = np.zeros((1,s))                           #1, 10
    Wo = np.random.normal(mean, stand_dev_o, (s, c)) #1000, 10
    bo = np.zeros((1,c))                           #1, 10

In [6]:
    del_Wh = np.full((F, s), 1e-5)
    del_bh = np.full((1, s), 1e-5)
    del_Wo = np.full((s, c), 1e-5)
    del_bo = np.full((1,c), 1e-5)

    gamma = 0.99
    learning_rate = 0.01

In [5]:
        sh = computeLayer(xi, Wh, bh)
        xh = relu(sh)
        print ("xh's shape: ", xh.shape)
        so = computeLayer(xh, Wo, bo)
        yo = softmax(so)
        print ("yo's shape: ", yo.shape)

xh's shape:  (10000, 10000)
yo's shape:  (10000, 10)


In [7]:
grad_ce = gradCE(train_y,yo)

score.shape:  (10000, 10)
target.shape:  (10000, 10)


In [14]:
der_wo = np.dot(xh,grad_ce)

In [15]:
der_bo = np.dot(1,grad_ce)

In [20]:
der_wh = np.dot(grad_ce,np.transpose(Wo))

In [32]:
print(der_wh.shape)
print (sh.shape)

(10000, 10000)
(10000, 10000)


In [27]:
der_wh1 = np.where(sh>0,der_wh,0)

In [54]:
der_wh = np.dot(der_wh1,xi)
print (der_wh.shape)

(10000, 784)


In [55]:
print(der_wo.shape)
print(der_bo.shape)

(10000, 10)
(10000, 10)


In [60]:
    der_e_xib = np.dot(grad_ce,1)
    der_e_xib = np.where(sh>0,der_e_xib,0) #delta_h
    der_bh = np.dot(1,der_e_xib)
    print (der_bh.shape)

ValueError: operands could not be broadcast together with shapes (10000,10000) (10000,10) () 

In [48]:
def backPropagation(xi,sh,xh,so,wo,prediction,target):

    grad_ce = gradCE(target,prediction) #delta_o
    
    der_wo = np.dot(xh,grad_ce)
    der_bo = np.dot(1,grad_ce)

    der_e_xiw = np.dot(grad_ce,np.transpose(wo))
    der_e_xiw = np.where(sh>0,der_e_xiw,0)  #delta_h
    der_wh = np.dot(xi,der_e_xiw)

    der_e_xib = np.dot(grad_ce,1)
    der_e_xib = np.where(sh>0,der_e_xib,0) #delta_h
    der_bh = np.dot(1,der_e_xib)

    return der_wo, der_bo, der_wh, der_bh

In [58]:
def train_network(epochs=200):
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
    train_y, valid_y, test_y = convertOneHot(trainTarget, validTarget, testTarget)

    print ("trainTarget.shape: ",trainTarget.shape)
    print ("newtrain.shape: ", train_y.shape)
    s, l, h = trainData.shape #1000 samples, 28, 28
    F = l*h  #784 features
    c = 10  #10 classes
    xi = trainData.reshape(s, F)
    variance_h = 2/(F + s)
    variance_o = 2/(s + c)

    mean, stand_dev_h, stand_dev_o = 0, math.sqrt(variance_h), math.sqrt(variance_o), 

    Wh = np.random.normal(mean, stand_dev_h, (F, s)) #784,1000
    bh = np.zeros((1,s))                           #1, 10
    Wo = np.random.normal(mean, stand_dev_o, (s, c)) #1000, 10
    bo = np.zeros((1,c))                           #1, 10
    print (Wh.shape)    
    print (bh.shape)
    print (Wo.shape)
    print (bo.shape)

    v_Wh = np.full((F, s), 1e-5)
    v_bh = np.full((1, s), 1e-5)
    v_Wo = np.full((s, c), 1e-5)
    v_bo = np.full((1,c), 1e-5)

    gamma = 0.99
    learning_rate = 0.01

    i = 0
    while i < epochs:
        # forward propagate

        sh = computeLayer(xi, Wh, bh)
        xh = relu(sh)
        print ("xh's shape: ", xh.shape)
        so = computeLayer(xh, Wo, bo)
        yo = softmax(so)
        print ("yo's shape: ", yo.shape)
        
        print (sh.shape)
        print(xh.shape)
        print(so.shape)
        print(yo.shape)
        # backward propagate
        der_wo, der_bo, der_wh, der_bh = backPropagation(xi,sh,xh,so,Wo,yo,train_y)

        v_Wh = gamma*v_Wh + learning_rate*der_wh
        v_bh = gamma*v_bh + learning_rate*der_bh
        v_Wo = gamma*v_Wo + learning_rate*der_wo
        v_bo = gamma*v_bo + learning_rate*der_bo

        Wh = Wh - v_Wh
        bh = bh - v_bh
        Wo = Wo - v_Wo
        bo = bo - v_bo
    print(Wo)
    print(bo)
    print (Wh)
    print (bh)
    a, b = classify_result(trainData, trainTarget, Wo, bo)
    print(a)
    print(b)
    return


In [59]:
train_network()

trainTarget.shape:  (10000,)
newtrain.shape:  (10000, 10)
(784, 10000)
(1, 10000)
(10000, 10)
(1, 10)
xh's shape:  (10000, 10000)
yo's shape:  (10000, 10)
(10000, 10000)
(10000, 10000)
(10000, 10)
(10000, 10)
score.shape:  (10000, 10)
target.shape:  (10000, 10)


ValueError: shapes (10000,784) and (10000,10000) not aligned: 784 (dim 1) != 10000 (dim 0)