In [0]:
# Download the four gzip-compressed MNIST IDX archives (train-* and t10k-*
# image and label files) into the current working directory.
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz

In [0]:
# Decompress every archive in the working directory matching t*-ubyte.gz
# (this glob covers both the train-* and t10k-* downloads).
!gunzip t*-ubyte.gz

In [0]:
import struct
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime

In [0]:
# Directory under the user's home folder: ~/Documents/OR 610.
# NOTE(review): no visible cell reads `path`; the IDX files are opened by bare
# filename relative to the working directory. Confirm whether it is still needed.
path = os.path.join(os.path.expanduser('~'), 'Documents', 'OR 610')
def read_idx(filename):
    """Load an IDX-format file (the container used by MNIST) into an array.

    The 4-byte header is two zero bytes, a type code, and the number of
    dimensions; it is followed by one big-endian uint32 per dimension and
    then the raw payload.

    Parameters
    ----------
    filename : str
        Path of the *uncompressed* IDX file.

    Returns
    -------
    numpy.ndarray
        Writable array with the shape given in the header. ``uint8`` payloads
        (type code 0x08, as used by MNIST) come back as ``np.uint8``; the other
        IDX type codes are decoded with their big-endian dtype.

    Raises
    ------
    ValueError
        If the header does not start with two zero bytes (for example the
        file is still gzip-compressed) or the type code is unknown.
    """
    # IDX type codes; multi-byte values are stored most-significant byte first.
    idx_dtypes = {
        0x08: np.dtype(np.uint8),
        0x09: np.dtype(np.int8),
        0x0B: np.dtype('>i2'),
        0x0C: np.dtype('>i4'),
        0x0D: np.dtype('>f4'),
        0x0E: np.dtype('>f8'),
    }
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        if zero != 0:
            raise ValueError('%r is not an IDX file (bad magic prefix)' % (filename,))
        if data_type not in idx_dtypes:
            raise ValueError('unsupported IDX type code 0x%02X in %r' % (data_type, filename))
        shape = tuple(struct.unpack('>I', f.read(4))[0] for _ in range(dims))
        payload = f.read()
    # np.fromstring is deprecated for binary input; np.frombuffer is the
    # replacement. It returns a read-only view of `payload`, so copy to hand
    # back a writable array as before.
    return np.frombuffer(payload, dtype=idx_dtypes[data_type]).reshape(shape).copy()
    
def oneHotEncoding(label, n_classes=None):
    """One-hot encode integer class labels, one column per example.

    Parameters
    ----------
    label : array-like of int
        Class indices.
    n_classes : int, optional
        Number of rows (classes) in the output. Defaults to ``max(label) + 1``,
        which under-counts if the highest class is absent from `label`; pass
        it explicitly when that can happen.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_classes, len(label))`` for 1-D input, with a
        single 1.0 per column.
    """
    labels = np.asarray(label)
    if n_classes is None:
        # Convert to a Python int before adding 1 so a uint8 maximum of 255
        # cannot wrap around to 0.
        n_classes = int(labels.max()) + 1
    return np.eye(n_classes)[labels].T


def imageProcess(data):
    """Scale a stack of images by 1/255 and flatten each one into a column.

    `data` is indexed as (image, row, column). The result has one row per
    pixel and one column per image.
    """
    scaled = data / 255
    n_images = scaled.shape[0]
    n_pixels = scaled.shape[1] * scaled.shape[2]
    flattened = scaled.reshape(n_images, n_pixels)
    return flattened.T

In [0]:
def softMax(X):
    """Column-wise softmax: normalise along axis 0 so each column sums to 1.

    The per-column maximum is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to inf (and the ratio from becoming nan) on large logits.

    Parameters
    ----------
    X : array-like
        Logits, with classes along axis 0.

    Returns
    -------
    numpy.ndarray
        Probabilities with the same shape as `X`.
    """
    X = np.asarray(X)
    shifted = X - np.max(X, axis=0, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=0, keepdims=True)

def ReLU(z):
    """Rectified linear unit: element-wise maximum of 0 and `z`."""
    floor = 0
    return np.maximum(floor, z)
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1. + np.exp(-z)
    return 1. / denominator
def tanh(z):
    """Hyperbolic tangent, delegated to ``np.tanh``."""
    activated = np.tanh(z)
    return activated

In [0]:
def dReLU(z):
    """Derivative of ReLU: 1 where `z` is strictly positive, otherwise 0."""
    is_positive = z > 0
    # Multiplying by 1 turns the boolean mask into integers.
    return is_positive * 1
def dSigmoid(z):
    """Derivative of the logistic function: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    return s * (1 - s)
def dTanh(z):
    """Derivative of tanh, computed as 1 / cosh(z)**2."""
    cosh_squared = np.cosh(z) ** 2
    return 1 / cosh_squared

In [0]:
def crossEntropyR2(y, y_hat, lamda, params):
    """Mean categorical cross-entropy plus an L2 penalty on all weight matrices.

    cost = -(1/m) * sum(y * log(y_hat)) + lamda / (2m) * sum_k ||W_k||^2

    Parameters
    ----------
    y : numpy.ndarray
        One-hot targets; ``y.shape[1]`` is taken as the number of examples m.
    y_hat : numpy.ndarray
        Predicted probabilities, same shape as `y`.
    lamda : float
        L2 regularisation strength.
    params : dict
        Holds 'W1', 'b1', 'W2', 'b2', ...; only the 'W*' entries are penalised.

    Returns
    -------
    float
        The regularised cost.
    """
    m = y.shape[1]
    n_layers = len(params) // 2
    l2_sum = 0
    for layer in range(1, n_layers + 1):
        l2_sum += np.sum(params['W' + str(layer)] ** 2)

    # Floor the predictions before the log: an exact 0 would give
    # 0 * log(0) = nan (or inf), which poisons the whole sum.
    eps = 1e-12
    log_probs = np.log(np.maximum(y_hat, eps))

    data_loss = -(1. / m) * np.sum(y * log_probs)
    penalty = lamda / (2. * m) * l2_sum
    return data_loss + penalty


def forward(X, params, activation, outputActivation=None):
    """Forward pass through the fully connected network.

    Hidden layers use `activation`. The last layer uses `outputActivation`,
    which defaults to ``softMax``. That matches the rest of the pipeline:
    ``back`` seeds back-propagation with ``A_L - y`` (the softmax +
    cross-entropy gradient), the cost is a categorical cross-entropy, and
    ``classifer`` applies softmax at inference. Applying the hidden activation
    to the output layer (e.g. ReLU) can emit exact zeros and break the log in
    the cost.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs, one example per column.
    params : dict
        'W1', 'b1', ..., 'WL', 'bL'.
    activation : callable
        Element-wise activation for hidden layers.
    outputActivation : callable, optional
        Activation for the final layer. Pass `activation` here to apply the
        hidden activation on the output layer as well.

    Returns
    -------
    dict
        'Z1', 'A1', ..., 'ZL', 'AL': pre-activations and activations per layer.
    """
    if outputActivation is None:
        outputActivation = softMax

    n_layers = len(params) // 2
    forwardPass = {}

    A_prev = X
    for layer in range(1, n_layers + 1):
        tag = str(layer)
        Z = np.matmul(params['W' + tag], A_prev) + params['b' + tag]
        layer_activation = outputActivation if layer == n_layers else activation
        A_prev = layer_activation(Z)
        forwardPass['Z' + tag] = Z
        forwardPass['A' + tag] = A_prev
    return forwardPass


def back(X, y, forwardPass, params, dActivation):
    """Back-propagate through the network and return all gradients.

    The output layer is seeded with ``dZ_L = A_L - y``. Each earlier layer
    uses ``dA_l = W_{l+1}^T dZ_{l+1}`` and ``dZ_l = dA_l * dActivation(Z_l)``.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs used for the forward pass; ``X.shape[1]`` is the batch size m.
    y : numpy.ndarray
        Targets with the same shape as the output activations.
    forwardPass : dict
        'Z1', 'A1', ... as produced by ``forward``.
    params : dict
        'W1', 'b1', ..., 'WL', 'bL'.
    dActivation : callable
        Derivative of the hidden-layer activation, evaluated at Z.

    Returns
    -------
    dict
        'dZl', 'dWl', 'dbl' for every layer, plus 'dAl' for the hidden layers.
        Every 'dbl' is a column vector with one entry per unit of layer l.
    """
    m = X.shape[1]
    n_layers = len(params) // 2
    gradient = {}

    for layer in range(n_layers, 0, -1):
        tag = str(layer)
        if layer == n_layers:
            dZ = forwardPass['A' + tag] - y
        else:
            nxt = str(layer + 1)
            dA = np.matmul(params['W' + nxt].T, gradient['dZ' + nxt])
            gradient['dA' + tag] = dA
            dZ = dA * dActivation(forwardPass['Z' + tag])

        # The "activation" feeding the first layer is the input itself.
        A_prev = X if layer == 1 else forwardPass['A' + str(layer - 1)]

        gradient['dZ' + tag] = dZ
        gradient['dW' + tag] = (1. / m) * np.matmul(dZ, A_prev.T)
        # Sum over examples only (axis=1) so each unit keeps its own bias
        # gradient; an unqualified np.sum would collapse all units into one
        # shared scalar.
        gradient['db' + tag] = (1. / m) * np.sum(dZ, axis=1, keepdims=True)
    return gradient

def updater(params,grad,eta,lamda,m):
    """Take one gradient-descent step and return a new parameter dict.

    Weights move by ``eta * dW`` and are additionally shrunk by
    ``W * lamda * eta / m`` (the L2 term); biases move by ``eta * db`` only.
    The input `params` dict is not modified.
    """
    layer_count = len(params) // 2
    updatedParams = {}
    for layer in range(1, layer_count + 1):
        w_key = 'W' + str(layer)
        b_key = 'b' + str(layer)
        weights = params[w_key]
        decay = (weights * lamda * eta) / m
        updatedParams[w_key] = weights - eta * grad['d' + w_key] - decay
        updatedParams[b_key] = params[b_key] - eta * grad['d' + b_key]

    return updatedParams

def classifer(X, params,activation):
    """Predict a class index for every column of `X`.

    Runs the hidden layers with `activation`, chaining each layer's output
    into the next, then takes the arg-max over the final layer's logits.
    Softmax is strictly increasing within a column, so the arg-max of the
    logits equals the arg-max of the softmax probabilities; skipping the
    exponentiation also avoids any overflow.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs, one example per column.
    params : dict
        'W1', 'b1', ..., 'WL', 'bL'.
    activation : callable
        Element-wise activation for the hidden layers.

    Returns
    -------
    numpy.ndarray
        Predicted class index per column of `X`.
    """
    n_layers=len(params)//2
    A = X
    # Hidden layers 1 .. L-1, each fed by the previous layer's activations.
    for layer in range(1, n_layers):
        Z = np.matmul(params['W'+str(layer)], A) + params['b'+str(layer)]
        A = activation(Z)

    Z = np.matmul(params['W'+str(n_layers)], A) + params['b'+str(n_layers)]
    pred = np.argmax(Z, axis=0)
    return pred

In [9]:
# Load the MNIST IDX files from the working directory. Images go through
# imageProcess; training labels are one-hot encoded, while the test labels
# are kept as read from file for the accuracy comparison in later cells.
X_train = imageProcess(read_idx('train-images-idx3-ubyte'))
y_train = oneHotEncoding(read_idx('train-labels-idx1-ubyte'))
X_test = imageProcess(read_idx('t10k-images-idx3-ubyte'))
y_test = read_idx('t10k-labels-idx1-ubyte')

# General hyperparameters shared by the training cells below
m=10000 # number of training columns sampled for each iteration (batch size)
n_x = X_train.shape[0]  # input size: number of rows of X_train
n_h = 100  # hidden-layer width
eta = 1  # learning rate passed to updater
lamda = 2  # L2 regularisation strength passed to crossEntropyR2 and updater
np.random.seed(7)  # seed NumPy's global RNG (weight init and batch sampling)
epoch = 1000  # number of training iterations; each draws one batch of m columns

  


In [0]:
####### SIGMOID SECTION #######
# Two-layer network (n_x -> n_h -> 10). Weights are drawn from a normal
# distribution scaled by sqrt(1 / fan_in); biases start at zero.
sigmoidParams = {'W1': np.random.randn(n_h, n_x)* np.sqrt(1. / n_x),
                 'b1': np.zeros((n_h, 1)),
                 'W2': np.random.randn(10, n_h)* np.sqrt(1. / n_h),
                 'b2': np.zeros((10, 1))
                 }

start = datetime.now()
for i in range(epoch):
    # Draw a fresh batch: m distinct training columns chosen at random.
    idx = np.random.permutation(X_train.shape[1])[:m]
    X=X_train[:,idx]
    y=y_train[:,idx]
    # Forward pass
    forwardPass = forward(X,sigmoidParams,sigmoid)
    # Regularised cost on this batch
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, sigmoidParams)
    # Back-propagation
    gradient = back(X, y, forwardPass, sigmoidParams,dSigmoid)
    # Gradient step with L2 weight decay
    sigmoidParams=updater(sigmoidParams,gradient,eta,lamda,m)
    # Report progress occasionally instead of printing every loop index,
    # which floods the cell output.
    if i % 100 == 0:
        print('iteration', i, 'cost:', cost)

difference = datetime.now() - start
print("Final cost:", cost)
print('time to train:', difference)

y_hat = classifer(X_test, sigmoidParams, sigmoid)

# Fraction of test examples whose predicted class equals the true label.
# np.mean is vectorised; the builtin sum() iterates the array element by element.
print('Accuracy:', np.mean(y_hat == y_test))

In [0]:
####### RELU SECTION #######
# Two-layer network (n_x -> n_h -> 10) trained with ReLU / dReLU. Weights are
# drawn from a normal distribution scaled by sqrt(2 / fan_in); biases start at zero.
reluParams = {'W1': np.random.randn(n_h, n_x)* np.sqrt(2. / n_x),
                 'b1': np.zeros((n_h, 1)),
                 'W2': np.random.randn(10, n_h)* np.sqrt(2. / n_h),
                 'b2': np.zeros((10, 1))
                 }

start = datetime.now()
for i in range(epoch):
    # Draw a fresh batch: m distinct training columns chosen at random
    idx = np.random.permutation(X_train.shape[1])[:m]
    X=X_train[:,idx]
    y=y_train[:,idx]
    # Forward pass
    forwardPass = forward(X,reluParams,ReLU)
    # Regularised cost on this batch (only the last iteration's value is printed)
    cost = crossEntropyR2(y, forwardPass['A2'], lamda, reluParams)
    # Back-propagation
    gradient = back(X, y, forwardPass, reluParams,dReLU)
    # Gradient step with L2 weight decay
    reluParams=updater(reluParams,gradient,eta,lamda,m)
difference = datetime.now() - start
print("Final cost:", cost)
print('time to train:', difference)

# Predict a class for every test column with the trained parameters
y_hat = classifer(X_test, reluParams, ReLU)


# Fraction of test examples whose predicted class equals the label in y_test
print('Accuracy:',sum(y_hat==y_test)*1/len(y_test))