In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import math
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


# Load the data
def loadData():
    with np.load("notMNIST.npz") as data:
        Data, Target = data["images"], data["labels"]
        np.random.seed(521)
        randIndx = np.arange(len(Data))
        np.random.shuffle(randIndx)
        Data = Data[randIndx] / 255.0
        Target = Target[randIndx]
        trainData, trainTarget = Data[:10000], Target[:10000]
        validData, validTarget = Data[10000:16000], Target[10000:16000]
        testData, testTarget = Data[16000:], Target[16000:]
    return trainData, validData, testData, trainTarget, validTarget, testTarget

# Implementation of a neural network using only Numpy - trained using gradient descent with momentum


def convertOneHot(trainTarget, validTarget, testTarget):
    newtrain = np.zeros((trainTarget.shape[0], 10))
    newvalid = np.zeros((validTarget.shape[0], 10))
    newtest = np.zeros((testTarget.shape[0], 10))

    for item in range(0, trainTarget.shape[0]):
        newtrain[item][trainTarget[item]] = 1
    for item in range(0, validTarget.shape[0]):
        newvalid[item][validTarget[item]] = 1
    for item in range(0, testTarget.shape[0]):
        newtest[item][testTarget[item]] = 1
    return newtrain, newvalid, newtest


def shuffle(trainData, trainTarget):
    np.random.seed(421)
    randIndx = np.arange(len(trainData))
    target = trainTarget
    np.random.shuffle(randIndx)
    data, target = trainData[randIndx], target[randIndx]
    return data, target


def relu(x):
    # TODO
    return np.maximum(x, 0)


def softmax(x):
    # TODO
    return np.exp(x)/np.sum(np.exp(x))


def computeLayer(X, W, b):
    # TODO
    return np.matmul(X, W)+b


def CE(target, prediction):
    # TODO
    score = softmax(prediction)
    ce = np.sum(np.multiply(target, np.log(score)), axis=1)
    loss = -np.mean(ce)
    return loss


def gradCE(target, prediction):
    # TODO
    N = target.shape[0]
    score = softmax(prediction)
    print ("score.shape: ", score.shape)
    print ("target.shape: ",target.shape)
    res = score - prediction
    return res


def error(target, prediction):
    return np.multiply((target-prediction),(target-prediction))


def derivation_LW(last_X, y, target):
    grad_CE = gradCE(target, y)
    res = 2*np.multiply((np.multiply((y-target), grad_CE)), last_X)
    return res

  from ._conv import register_converters as _register_converters


In [2]:
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
    train_y, valid_y, test_y = convertOneHot(trainTarget, validTarget, testTarget)


In [3]:
    s, l, h = trainData.shape #1000 samples, 28, 28
    F = l*h  #784 features
    c = 10  #10 classes
    xi = trainData.reshape(s, F)
    variance_h = 2/(F + s)
    variance_o = 2/(s + c)

    mean, stand_dev_h, stand_dev_o = 0, math.sqrt(variance_h), math.sqrt(variance_o), 

    Wh = np.random.normal(mean, stand_dev_h, (F, s)) #784,1000
    bh = np.zeros((1,s))                           #1, 10
    Wo = np.random.normal(mean, stand_dev_o, (s, c)) #1000, 10
    bo = np.zeros((1,c))                           #1, 10

In [6]:
    del_Wh = np.full((F, s), 1e-5)
    del_bh = np.full((1, s), 1e-5)
    del_Wo = np.full((s, c), 1e-5)
    del_bo = np.full((1,c), 1e-5)

    gamma = 0.99
    learning_rate = 0.01

In [5]:
        sh = computeLayer(xi, Wh, bh)
        xh = relu(sh)
        print ("xh's shape: ", xh.shape)
        so = computeLayer(xh, Wo, bo)
        yo = softmax(so)
        print ("yo's shape: ", yo.shape)

xh's shape:  (10000, 10000)
yo's shape:  (10000, 10)


In [7]:
grad_ce = gradCE(train_y,yo)

score.shape:  (10000, 10)
target.shape:  (10000, 10)


In [14]:
der_wo = np.dot(xh,grad_ce)

In [15]:
der_bo = np.dot(1,grad_ce)

In [20]:
der_wh = np.dot(grad_ce,np.transpose(Wo))

In [32]:
print(der_wh.shape)
print (sh.shape)

(10000, 10000)
(10000, 10000)


In [27]:
der_wh1 = np.where(sh>0,der_wh,0)

In [41]:
der_wh

array([[ 0.00000000e+00,  1.72030639e-08,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00, -9.33761943e-09],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.16707038e-07,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         3.46566745e-08,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.98201867e-08, -1.31618474e-07,  3.22783158e-08],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -6.73429272e-08,  6.28130318e-08],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.70679905e-09,  0.00000000e+00,  3.39311141e-08]])

In [42]:
sh

array([[-0.04127626,  0.21212242, -0.28328297, ..., -0.02019434,
        -0.01616141,  0.19211828],
       [-0.10275515, -0.19528012, -0.2012543 , ...,  0.00223937,
        -0.07819647, -0.12196877],
       [-0.13868235, -0.03568546, -0.16692097, ...,  0.11587752,
        -0.23398471, -0.2237074 ],
       ...,
       [-0.04827196, -0.01571747, -0.32296424, ...,  0.15042049,
         0.09571025,  0.36415159],
       [-0.13641463, -0.2874804 , -0.14622749, ..., -0.1834559 ,
         0.01378742,  0.20048749],
       [-0.10550014, -0.09018722, -0.12940096, ...,  0.14237806,
        -0.01479827,  0.06382265]])

In [43]:
der_wh1

array([[ 0.00000000e+00,  1.72030639e-08,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00, -9.33761943e-09],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.16707038e-07,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         3.46566745e-08,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -1.98201867e-08, -1.31618474e-07,  3.22783158e-08],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -6.73429272e-08,  6.28130318e-08],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         2.70679905e-09,  0.00000000e+00,  3.39311141e-08]])

In [40]:
a= np.array([[ -0.04127626,  -0.04127626,  0.21212242],
       [ 0,  2, -1],
       [ 0,  3, -1]])
b = np.array([[ 10,  11,  12],
       [ 10,  12, 11],
       [ 10,  13, 11]])
np.where(a>0,b,0)

array([[ 0,  0, 12],
       [ 0, 12,  0],
       [ 0, 13,  0]])

In [44]:
def train_network(epochs=200):
    trainData, validData, testData, trainTarget, validTarget, testTarget = loadData()
    train_y, valid_y, test_y = convertOneHot(trainTarget, validTarget, testTarget)

    print ("trainTarget.shape: ",trainTarget.shape)
    print ("newtrain.shape: ", train_y.shape)
    s, l, h = trainData.shape #1000 samples, 28, 28
    F = l*h  #784 features
    c = 10  #10 classes
    xi = trainData.reshape(s, F)
    variance_h = 2/(F + s)
    variance_o = 2/(s + c)

    mean, stand_dev_h, stand_dev_o = 0, math.sqrt(variance_h), math.sqrt(variance_o), 

    Wh = np.random.normal(mean, stand_dev_h, (F, s)) #784,1000
    bh = np.zeros((1,s))                           #1, 10
    Wo = np.random.normal(mean, stand_dev_o, (s, c)) #1000, 10
    bo = np.zeros((1,c))                           #1, 10


    v_Wh = np.full((F, s), 1e-5)
    v_bh = np.full((1, s), 1e-5)
    v_Wo = np.full((s, c), 1e-5)
    v_bo = np.full((1,c), 1e-5)

    gamma = 0.99
    learning_rate = 0.01

    i = 0
    while i < epochs:
        # forward propagate

        sh = computeLayer(xi, Wh, bh)
        xh = relu(sh)
        print ("xh's shape: ", xh.shape)
        so = computeLayer(xh, Wo, bo)
        yo = softmax(so)
        print ("yo's shape: ", yo.shape)

        # backward propagate
        der_wo, der_bo, der_wh, der_bh = backPropagation(xi,sh,xh,so,Wo,yo,train_y)

        v_Wh = gamma*v_Wh + learning_rate*der_wh
        v_bh = gamma*v_bh + learning_rate*der_bh
        v_Wo = gamma*v_Wo + learning_rate*der_wo
        v_bo = gamma*v_bo + learning_rate*der_bo

        Wh = Wh - v_Wh
        bh = bh - v_bh
        Wo = Wo - v_Wo
        bo = bo - v_bo
    print(Wo)
    print(bo)
    print (Wh)
    print (bh)
    a, b = classify_result(trainData, trainTarget, Wo, bo)
    print(a)
    print(b)
    return


In [None]:
train_network()