In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

In [2]:
# Read the training CSV. header=None makes pandas treat the first row as data
# rather than as column names.
df = pd.read_csv("./data/mnist_train.csv", header=None)

In [3]:
# Convert the frame to a plain NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
training = df.to_numpy()

In [4]:
# Column 0 of every row is used as the class label, cast to integers.
Y_train = training[:, 0].astype('int')

In [5]:
# Columns 1 onward are used as the input features, cast to float64.
X_train = training[:, 1:].astype('float64')

In [7]:
# Read the test CSV the same way (first row is data, not a header).
# NOTE: this rebinds `df`; the training frame is no longer reachable by that name.
df = pd.read_csv("./data/mnist_test.csv", header=None)

In [8]:
# Convert the frame to a plain NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
test = df.to_numpy()

In [9]:
# Column 0 of every row is used as the class label, cast to integers.
Y_test = test[:, 0].astype('int')

In [10]:
# Columns 1 onward are used as the input features, cast to float64.
X_test = test[:, 1:].astype('float64')

In [12]:
# Scale the test features by 1/255.
# Rebuilt from `test` instead of dividing in place, so running this cell more
# than once cannot shrink the values a second time.
X_test = test[:, 1:].astype('float64') / 255

In [13]:
# Scale the training features by 1/255.
# Rebuilt from `training` instead of dividing in place, so running this cell
# more than once cannot shrink the values a second time.
X_train = training[:, 1:].astype('float64') / 255

In [None]:
def reLu(x, derivative=False):
    """Rectified linear unit, or its 0/1 derivative mask.

    Parameters
    ----------
    x : scalar or array
        Anything supporting ``x > 0`` and multiplication by the result.
    derivative : bool, optional
        When equal to ``False`` (the default) the activation is returned;
        any other value selects the derivative.

    Returns
    -------
    ``x`` with non-positive entries multiplied by zero, or (derivative
    branch) ``1`` where ``x > 0`` and ``0`` elsewhere.
    """
    is_positive = x > 0
    # Equality test rather than truthiness: only a value equal to False
    # selects the activation branch.
    if derivative == False:
        return x * is_positive
    return 1 * is_positive

In [16]:
def softmax(x):
    """Row-wise softmax of a 2-D array of logits.

    Parameters
    ----------
    x : array_like, shape (n_rows, n_classes)
        Logits; each row is normalised independently. The input is not
        modified.

    Returns
    -------
    numpy.ndarray, same shape as ``x``
        Non-negative values whose rows each sum to 1.
    """
    logits = np.asarray(x)
    # Subtract each row's own maximum (into a new array). Using the row max,
    # not the global max, keeps at least one exponent equal to 0 in every row,
    # so no row can underflow to 0/0.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

In [17]:
def initialize_parameters_he(layers_dims):
    """Build He-initialised parameters for a fully connected network.

    Parameters
    ----------
    layers_dims : sequence of int
        Layer widths, input layer first.

    Returns
    -------
    dict
        For each layer ``l = 1 .. len(layers_dims) - 1``:
        ``'w<l>'`` with shape ``(layers_dims[l], layers_dims[l - 1])``, drawn
        from a standard normal and scaled by ``sqrt(2 / layers_dims[l - 1])``,
        and ``'b<l>'``, a zero column of shape ``(layers_dims[l], 1)``.
    """
    params = {}
    # Walk over consecutive (fan_in, fan_out) pairs; layer numbering starts at 1.
    consecutive = zip(layers_dims[:-1], layers_dims[1:])
    for index, (fan_in, fan_out) in enumerate(consecutive, start=1):
        suffix = str(index)
        scale = np.sqrt(2 / fan_in)
        params['w' + suffix] = np.random.randn(fan_out, fan_in) * scale
        params['b' + suffix] = np.zeros((fan_out, 1))
    return params

In [18]:
def dropout(x, keep_prop):
    """Apply inverted dropout to ``x``.

    Despite its name, ``keep_prop`` acts as the *drop* probability: every
    element survives with probability ``1 - keep_prop`` and survivors are
    scaled by ``1 / (1 - keep_prop)``. ``keep_prop = 0`` therefore leaves
    ``x`` unchanged.

    Parameters
    ----------
    x : array_like
        Activations to mask.
    keep_prop : float in [0, 1]
        Probability of zeroing each element (see note above).

    Returns
    -------
    numpy.ndarray
        ``x`` times the scaled random mask; all zeros when ``keep_prop == 1``.

    Raises
    ------
    ValueError
        If ``keep_prop`` is outside ``[0, 1]``.
    """
    if not 0 <= keep_prop <= 1:
        raise ValueError("keep_prop must be in [0, 1], got %r" % (keep_prop,))

    survive = 1 - keep_prop
    if survive == 0:
        # Everything is dropped. Return zeros directly: the general formula
        # would evaluate 0 / 0 and fill the result with NaN.
        return np.zeros(np.shape(x))

    mask = np.random.binomial(1, survive, size=np.shape(x)) / survive
    return x * mask

In [None]:
def predict(weights, x, keep_prop=0):
    """Forward pass through the network, returning every layer's output.

    Layers ``1 .. L-1`` apply ``reLu`` followed by ``dropout``; layer ``L``
    applies ``softmax``.

    Parameters
    ----------
    weights : dict
        ``'w<l>'`` / ``'b<l>'`` entries in the layout produced by
        ``initialize_parameters_he``: ``w<l>`` has shape ``(n_out, n_in)`` and
        ``b<l>`` has shape ``(n_out, 1)``.
    x : numpy.ndarray, shape (batch, n_in of layer 1)
        One sample per row.
    keep_prop : float, optional
        Forwarded unchanged to ``dropout``. With the default ``0`` the mask
        drawn by ``dropout`` keeps every element.

    Returns
    -------
    list of numpy.ndarray
        ``L`` arrays: the hidden activations in order, then the softmax output.

    Raises
    ------
    ValueError
        If ``weights`` does not describe at least one layer.
    """
    L = len(weights) // 2
    if L < 1:
        raise ValueError("weights must describe at least one layer")

    output = []
    activation = x
    for l in range(1, L):
        # Samples are rows, so multiply by W.T; b.T broadcasts the (n_out, 1)
        # bias column across the batch.
        z = activation @ weights['w' + str(l)].T + weights['b' + str(l)].T
        activation = dropout(reLu(z), keep_prop)
        output.append(activation)

    logits = activation @ weights['w' + str(L)].T + weights['b' + str(L)].T
    output.append(softmax(logits))
    return output

In [None]:
def accuracy(output, y):
    """Percentage of rows whose arg-max in ``output`` matches that in ``y``.

    Parameters
    ----------
    output : array_like, 2-D
        Per-class scores, one row per sample.
    y : array_like, 2-D
        Target rows; the position of each row's maximum is its class.

    Returns
    -------
    float
        ``100 * matches / number_of_rows_in_output``.
    """
    predicted = np.argmax(output, axis=1)
    expected = np.argmax(y, axis=1)
    # Pairwise comparison; zip stops at the shorter of the two sequences.
    matches = sum(1 for guess, truth in zip(predicted, expected) if guess == truth)
    return (matches * 100) / predicted.shape[0]

In [None]:
def log2(x):
    """Natural logarithm of a scalar (``np.log``, despite the name).

    Returns ``-np.inf`` directly when ``x`` equals zero instead of calling
    ``np.log``.
    """
    return np.log(x) if x != 0 else -np.inf
    
def log(y):
    """Apply ``log2`` to every element of a 2-D iterable.

    Returns a list of lists with the same nesting as ``y``.
    """
    return [list(map(log2, row)) for row in y]

def cost(Y_predict, Y_right):
    """Mean element-wise binary cross-entropy between prediction and target.

    Both terms go through ``np.nan_to_num`` before being summed, so a
    ``0 * -inf`` product contributes 0 instead of NaN.

    Returns
    -------
    numpy.ndarray
        The negated mean with ``keepdims=True`` (one element, same number of
        dimensions as the inputs).
    """
    target_term = np.nan_to_num(Y_right * log(Y_predict))
    complement_term = np.nan_to_num((1 - Y_right) * log(1 - Y_predict))
    Loss = -np.mean(target_term + complement_term, keepdims=True)
    return Loss

In [None]:
# Alias only: x_train is bound to the same array object as X_train (no copy).
x_train = X_train

In [None]:
# Alias only: x_valid is bound to the same array object as X_test (no copy),
# i.e. the test split is what gets used under the "valid" name.
x_valid = X_test

In [None]:
# Alias only: d_train is bound to the same array object as Y_train (no copy).
# NOTE(review): Y_train holds one integer label per row, whereas accuracy()
# takes argmax(y, axis=1) and cost()/ADAM() combine targets element-wise with
# per-class outputs — presumably a one-hot encoding is needed before use; confirm.
d_train = Y_train

In [None]:
# Alias only: d_valid is bound to the same array object as Y_test (no copy).
# NOTE(review): Y_test holds one integer label per row, whereas accuracy()
# takes argmax(y, axis=1) and cost() combines targets element-wise with
# per-class outputs — presumably a one-hot encoding is needed before use; confirm.
d_valid = Y_test

In [None]:
def ADAM(weights, x, t, outputs, eta, beta1, beta2, eps, i, nabla, cache=None):
    """Run one back-propagation pass and one Adam update for every layer.

    Works for any number of layers and uses the dict layout produced by
    ``initialize_parameters_he``: ``w<l>`` has shape ``(n_out, n_in)`` and
    ``b<l>`` has shape ``(n_out, 1)``; samples are rows of ``x``.

    Parameters
    ----------
    weights : dict
        ``'w1'..'wL'`` and ``'b1'..'bL'``. Not modified.
    x : numpy.ndarray, shape (batch, n_in of layer 1)
        Input batch.
    t : numpy.ndarray, same shape as ``outputs[-1]``
        Targets; the output-layer error signal is ``(t - y) / batch``.
    outputs : sequence of numpy.ndarray, length L
        Per-layer outputs for ``x`` (hidden activations, then the final
        output ``y``), as returned by ``predict``.
    eta : float
        Step size.
    beta1, beta2 : float
        Decay rates of the first / second moment estimates.
    eps : float
        Added to the denominator of the step.
    i : int
        1-based update counter used for bias correction.
    nabla : float
        L2 penalty coefficient, applied to weights and biases.
    cache : dict, optional
        Moment estimates returned by the previous call; zeros are used when
        it is ``None``.

    Returns
    -------
    (dict, dict)
        Updated parameters (same keys as ``weights``) and the updated moment
        cache (keys ``'m' + name`` / ``'v' + name``).

    Raises
    ------
    ValueError
        If there is no layer, ``outputs`` does not have one entry per layer,
        or ``i < 1`` (the bias correction would divide by zero).
    """
    L = len(weights) // 2
    if L < 1:
        raise ValueError("weights must describe at least one layer")
    if len(outputs) != L:
        raise ValueError("outputs must contain one array per layer")
    if i < 1:
        raise ValueError("i is a 1-based step counter and must be >= 1")

    names = ['w' + str(l) for l in range(1, L + 1)] + ['b' + str(l) for l in range(1, L + 1)]

    # `is None`, not `== None`: the latter is ambiguous for array-like caches.
    if cache is None:
        cache = {}
        for name in names:
            cache['m' + name] = np.zeros_like(weights[name])
            cache['v' + name] = np.zeros_like(weights[name])

    # ---- backward pass: every direction is computed from the current
    # ---- weights before any parameter is changed.
    # (t - y) / batch is the *negative* loss gradient at the output layer, so
    # the resulting directions are added to the parameters below.
    delta = (t - outputs[-1]) / x.shape[0]
    direction = {}
    for l in range(L, 0, -1):
        w = weights['w' + str(l)]
        b = weights['b' + str(l)]
        layer_input = x if l == 1 else outputs[l - 2]
        # The L2 penalty must pull parameters toward zero. Because `direction`
        # is a descent direction that gets added, the penalty term enters
        # with a minus sign.
        direction['w' + str(l)] = delta.T @ layer_input - nabla * w
        direction['b' + str(l)] = delta.sum(axis=0).reshape(b.shape) - nabla * b
        if l > 1:
            # Propagate to the previous layer through W and the ReLU mask.
            delta = (delta @ w) * reLu(outputs[l - 2], derivative=True)

    # ---- Adam update ----
    new_weights = dict(weights)
    new_cache = dict(cache)
    for name in names:
        g = direction[name]
        m = beta1 * cache['m' + name] + (1 - beta1) * g
        v = beta2 * cache['v' + name] + (1 - beta2) * (g ** 2)
        m_hat = m / (1 - beta1 ** i)
        v_hat = v / (1 - beta2 ** i)
        new_weights[name] = weights[name] + eta * m_hat / (np.sqrt(v_hat) + eps)
        new_cache['m' + name] = m
        new_cache['v' + name] = v

    return new_weights, new_cache