# Multi Layer Perceptron

In [1]:
import numpy as np
import math

In [2]:
import math  # This library is to use normal math functions like exp,sin etc.

In [3]:
# Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preparing Dataset

In [4]:
# Folder on the mounted Drive that holds the dataset files; edit this to match your own Drive layout.
data_path = "/content/drive/MyDrive/Colab Notebooks/PRNN_A1/Prnn_datasets/"
# Read the comma-separated file, skipping its first line (presumably a header row).
dataset = np.loadtxt(f"{data_path}PCA_MNIST.csv", delimiter=",", skiprows=1)

In [5]:
# Sanity check: (rows, columns) of the loaded array.
dataset.shape

(60000, 11)

In [6]:
# Min-max scale every feature column (columns 1 onwards) to [0, 1]; column 0 is
# left untouched. Small inputs keep the exp() inside the activations well-behaved.
# NOTE(review): min/max are taken over the whole array, i.e. before the
# train/test split below, so test rows influence the scaling.
feature_min = dataset[:, 1:].min(axis=0)
feature_range = dataset[:, 1:].max(axis=0) - feature_min
# A constant column has zero range; dividing by 1 maps it to 0 instead of 0/0 = NaN.
feature_range[feature_range == 0] = 1.0
dataset[:, 1:] = (dataset[:, 1:] - feature_min) / feature_range

In [7]:
# The first `count` rows are used for training and the remaining rows for testing.
# Column 0 is the target (Y); all other columns are the features (X).
count = 40000
X_train, Y_train = dataset[:count, 1:], dataset[:count, 0]
X_test, Y_test = dataset[count:, 1:], dataset[count:, 0]

In [8]:
# Number of target classes; sizes the one-hot label vectors and the confusion matrices below.
classes = 10

## Creating Neural Network

In [9]:
class neural_net:
  """Fully connected layer computing ``weights.T @ inputs + biases``.

  Data is column-oriented: one sample is an ``(input_size, 1)`` column and
  several samples may be stacked as the columns of an ``(input_size, batch)``
  matrix.
  """
  def __init__(self,input_size,neurons):
    """Create the layer with standard-normal random weights and zero biases.

    Args:
      input_size: number of input features (rows of ``weights``).
      neurons: number of output units (columns of ``weights``).
    """
    self.weights =  np.random.randn(input_size,neurons)
    self.biases = np.zeros((neurons,1))
  def forward(self,inputs):
    """Cache ``inputs`` and store the affine transform in ``self.output``."""
    self.inputs = inputs
    self.output = self.weights.T@inputs + self.biases
  def backward(self,dvalues):
    """Back-propagate ``dvalues``, the loss gradient w.r.t. ``self.output``.

    Sets ``self.dweights`` (same shape as ``weights``), ``self.dbiases`` and
    ``self.dinputs`` (gradient w.r.t. the cached inputs). ``forward`` must
    have been called first.
    """
    self.dweights = self.inputs@dvalues.T  # outer product, summed over samples
    # Sum over the sample axis so dbiases keeps the (neurons, 1) shape of
    # ``biases`` for batched input too; for a single column this is the
    # gradient itself.
    if dvalues.ndim == 2:
      self.dbiases = np.sum(dvalues,axis=1,keepdims=True)
    else:
      self.dbiases = dvalues

    self.dinputs = self.weights@dvalues

In [10]:
class sigmoid_activation:
  """Element-wise logistic activation, sigma(z) = 1 / (1 + exp(-z))."""
  def forward(self,input):
    """Cache ``input`` and store sigma(input) in ``self.output``."""
    self.inputs = input
    denominator = 1+np.exp(-input)
    self.output = 1/denominator
  def backward(self,dvalues):
    """Set ``self.dinputs`` to dvalues * sigma * (1 - sigma), using the output cached by ``forward``."""
    sigma = self.output
    scaled = dvalues * sigma
    self.dinputs = scaled * (1-sigma)

In [11]:
class relu_activation:
  """Element-wise rectified linear unit, max(0, z)."""
  def forward(self,input):
    """Cache ``input`` and store max(0, input) in ``self.output``."""
    self.inputs = input
    rectified = np.maximum(0,input)
    self.output = rectified
  def backward(self,dvalues):
    """Pass ``dvalues`` through where the cached input was strictly positive, zero elsewhere."""
    positive_mask = np.greater(self.inputs, 0).astype(int)
    self.dinputs = dvalues * positive_mask

In [12]:
class softmax_activation:
  """Softmax over a single column vector of scores."""
  def forward(self,input):
    """Cache ``input`` and store the softmax probabilities in ``self.output``.

    The largest score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but stops ``exp`` from overflowing to
    inf (and the output from becoming NaN) for large scores.
    """
    self.input = input
    exp_values = np.exp(input - np.max(input))
    self.output = exp_values/np.sum(exp_values)
  def backward(self,dvalues):
    """Set ``self.dinputs`` to the gradient w.r.t. the scores.

    ``dvalues`` is the gradient w.r.t. the probabilities, an ``(s, 1)``
    column. The Jacobian entry [i][j] is p_i * (delta_ij - p_j), built here
    in one broadcast expression from the first column of ``self.output``.
    """
    s =dvalues.shape[0]
    p = self.output[:s,0]
    jac = p[:,None]*(np.eye(s)-p[None,:])
    self.dinputs = jac.T@dvalues

In [13]:
class grad_des_optimizer:
  """Gradient descent with a constant step size ``alpha``.

  Gradients are summed on the layer by ``store_params`` and applied, divided
  by the batch size, by ``update``.
  """
  def __init__(self,alpha = 0.01):
    self.alpha = alpha
  def store_params(self,layer):
    """Add the layer's current ``dweights``/``dbiases`` to its running sums.

    The sums are created (as zeros) the first time a layer is seen; the
    ``sgd_flag`` attribute marks a layer as initialised.
    """
    slots = (('weights','dweights','dweight_curr_itr_sum'),
             ('biases','dbiases','dbias_curr_itr_sum'))
    if not hasattr(layer,'sgd_flag'):
      layer.sgd_flag = 1
      for param_name, _, sum_name in slots:
        setattr(layer, sum_name, np.zeros_like(getattr(layer, param_name)))

    for _, grad_name, sum_name in slots:
      setattr(layer, sum_name, getattr(layer, sum_name) + getattr(layer, grad_name))
  def update(self,layer,batch_size):
    """Step the parameters against the summed gradient divided by ``batch_size``, then zero the sums."""
    slots = (('weights','dweight_curr_itr_sum'),
             ('biases','dbias_curr_itr_sum'))
    for param_name, sum_name in slots:
      step = self.alpha * getattr(layer, sum_name)/batch_size
      setattr(layer, param_name, getattr(layer, param_name) - step)

    for param_name, sum_name in slots:
      setattr(layer, sum_name, np.zeros_like(getattr(layer, param_name)))

In [14]:
class adam_optimizer:
  """Adam optimizer: per-parameter steps scaled by running gradient moments.

  ``store_params`` sums the gradients of the current mini-batch on the layer.
  ``update`` averages them, refreshes the first/second moment estimates,
  applies the bias-corrected Adam step and clears the sums. All state lives
  on the layer object, so one optimizer can serve several layers.
  """
  def __init__(self,alpha = 0.01,beta_1=0.9,beta_2=0.9,epsilon=1e-8):
    """
    Args:
      alpha: step size.
      beta_1: decay rate of the first-moment (mean) estimate.
      beta_2: decay rate of the second-moment (uncentred variance) estimate.
      epsilon: small constant added to the denominator to avoid division by zero.
    """
    self.alpha = alpha
    self.beta_1 = beta_1
    self.beta_2 = beta_2
    self.epsilon = epsilon
  def store_params(self,layer):
    """Add the layer's current ``dweights``/``dbiases`` to its mini-batch sums.

    Optimizer state is created on the layer the first time it is seen; the
    ``adam_flag`` attribute marks a layer as initialised.
    """
    if not hasattr(layer,'adam_flag'):
      layer.adam_flag=1
      layer.adam_step = 0  # number of updates applied so far (for bias correction)
      layer.dweight_sum = np.zeros_like(layer.weights)     # first moment, weights
      layer.dbias_sum = np.zeros_like(layer.biases)        # first moment, biases
      layer.dweight_sq_sum = np.zeros_like(layer.weights)  # second moment, weights
      layer.dbias_sq_sum = np.zeros_like(layer.biases)     # second moment, biases

      layer.dweight_curr_itr_sum = np.zeros_like(layer.weights)
      layer.dbias_curr_itr_sum = np.zeros_like(layer.biases)

    layer.dweight_curr_itr_sum = layer.dweight_curr_itr_sum + layer.dweights
    layer.dbias_curr_itr_sum = layer.dbias_curr_itr_sum + layer.dbiases
  def _adam_step(self,param,grad,m,v,t):
    """Return (new_param, new_m, new_v) after Adam update number ``t`` with gradient ``grad``."""
    m = self.beta_1*m+(1-self.beta_1)*grad
    v = self.beta_2*v+(1-self.beta_2)*grad**2
    # The moments start at zero, so early estimates are biased towards zero;
    # dividing by (1 - beta**t) corrects for that.
    m_hat = m/(1-self.beta_1**t)
    v_hat = v/(1-self.beta_2**t)
    return param-self.alpha*m_hat/(np.sqrt(v_hat)+self.epsilon),m,v
  def update(self,layer,batch_size):
    """Apply one Adam step using the mean of the stored gradients, then clear the sums.

    ``store_params`` must have been called on ``layer`` at least once before.
    """
    layer.adam_step += 1
    t = layer.adam_step

    layer.weights,layer.dweight_sum,layer.dweight_sq_sum = self._adam_step(
        layer.weights,layer.dweight_curr_itr_sum/batch_size,
        layer.dweight_sum,layer.dweight_sq_sum,t)
    layer.biases,layer.dbias_sum,layer.dbias_sq_sum = self._adam_step(
        layer.biases,layer.dbias_curr_itr_sum/batch_size,
        layer.dbias_sum,layer.dbias_sq_sum,t)

    layer.dweight_curr_itr_sum = np.zeros_like(layer.weights)
    layer.dbias_curr_itr_sum = np.zeros_like(layer.biases)

## Training with Squared error loss

The cells below show how to train and evaluate a network built from the classes above. You can adjust the parameters in each step (step size, layer sizes, batch size, number of epochs) to suit your needs.

In [17]:
sgd_opt = grad_des_optimizer(0.1) # Constant step size (alpha), chosen by trial and error

In [18]:
# Build a network of three dense layers (10 -> 50 -> 25 -> 10) with a sigmoid
# after each of the first two; the last layer's raw output is the prediction.
layer1, layer2, layer3 = [
    neural_net(input_size=n_in, neurons=n_out)
    for n_in, n_out in ((10, 50), (50, 25), (25, 10))
]
act1, act2 = sigmoid_activation(), sigmoid_activation()

In [19]:
#Training
nb = 1  # mini-batch size: number of samples whose gradients are averaged per update
for epochs in range(20):
  loss = 0
  pending = 0  # samples accumulated since the last parameter update
  for i in range(X_train.shape[0]):

    x=X_train[i,:]
    y = Y_train[i]
    x = x.reshape((x.shape[0],1))       # feature vector as a column
    yl = np.zeros((classes,1))          # one-hot encoding of the label y
    yl[int(y)][0]=1

    # Forward pass
    layer1.forward(x)
    act1.forward(layer1.output)
    layer2.forward(act1.output)
    act2.forward(layer2.output)
    layer3.forward(act2.output)
    pred = layer3.output

    # Monitoring metric: running sum of the L2 norm of the error.
    loss = loss+np.linalg.norm(pred-yl)
    # Gradient of 0.5*||pred-yl||^2 w.r.t. pred. The sign matters: reversed, training diverges.
    dinput_l = (pred-yl)

    # Backward pass; the optimizer accumulates each layer's gradients.
    layer3.backward(dinput_l)
    sgd_opt.store_params(layer3)

    act2.backward(layer3.dinputs)

    layer2.backward(act2.dinputs)
    sgd_opt.store_params(layer2)

    act1.backward(layer2.dinputs)

    layer1.backward(act1.dinputs)
    sgd_opt.store_params(layer1)

    # Update once exactly nb samples have been accumulated, so the division
    # by the batch size inside update() is correct for every batch.
    pending += 1
    if pending == nb:
      sgd_opt.update(layer3,nb)
      sgd_opt.update(layer2,nb)
      sgd_opt.update(layer1,nb)
      pending = 0

  # Flush a trailing partial batch so its gradients do not leak into the next epoch.
  if pending:
    sgd_opt.update(layer3,pending)
    sgd_opt.update(layer2,pending)
    sgd_opt.update(layer1,pending)
    pending = 0

  print(loss)

28745.098063529433
18394.33669513019
15386.301971825073
14586.100368465308
14063.013494455547
13620.101088903328
13232.590632113346
12897.565650435008
12598.195460709798
12326.006796266829
12075.741882531656
11843.627845919847
11626.63815712184
11422.136846387892
11227.484493485514
11040.458821642507
10861.051876421514
10691.135794521319
10531.893246129359
10384.108533645149


In [20]:
#Testing
# Run every test sample through the trained network; accumulate the error
# norm, the number of correct predictions and the confusion matrix
# (rows: true class, columns: predicted class).
conf_matrix = np.zeros((classes,classes))
loss = 0
count = 0
for i in range(X_test.shape[0]):
  y = Y_test[i]
  x = X_test[i,:].reshape((-1,1))     # feature vector as a column
  true_class = int(y)
  yl = np.zeros((classes,1))          # one-hot encoding of the label
  yl[true_class][0]=1

  layer1.forward(x)
  act1.forward(layer1.output)
  layer2.forward(act1.output)
  act2.forward(layer2.output)
  layer3.forward(act2.output)
  pred = layer3.output

  loss = loss+np.linalg.norm(pred-yl)
  predicted_class = np.argmax(pred)
  if predicted_class==true_class:
    count+=1
  conf_matrix[true_class][predicted_class] += 1
print(loss)
print(count * 100 /X_test.shape[0])


5754.1571336063935
92.92


## Training with cross entropy loss

In [21]:
sgd_opt = grad_des_optimizer()  # no argument: the optimizer's default step size is used

In [22]:
# Build a fresh network of three dense layers (10 -> 50 -> 25 -> 10) with a
# sigmoid after each of the first two and a softmax on the final scores.
layer1, layer2, layer3 = [
    neural_net(input_size=n_in, neurons=n_out)
    for n_in, n_out in ((10, 50), (50, 25), (25, 10))
]
act1, act2 = sigmoid_activation(), sigmoid_activation()
softmax = softmax_activation()

In [23]:
#Training
nb = 1  # mini-batch size: number of samples whose gradients are averaged per update
for epochs in range(20):
  loss = 0
  pending = 0  # samples accumulated since the last parameter update
  for i in range(X_train.shape[0]):

    x=X_train[i,:]
    y = Y_train[i]
    x = x.reshape((x.shape[0],1))       # feature vector as a column
    yl = np.zeros((classes,1))          # one-hot encoding of the label y
    yl[int(y)][0]=1

    # Forward pass
    layer1.forward(x)
    act1.forward(layer1.output)
    layer2.forward(act1.output)
    act2.forward(layer2.output)
    layer3.forward(act2.output)
    softmax.forward(layer3.output)
    pred = softmax.output

    # Cross-entropy of the true class. Index down to a scalar (math.log on a
    # 1-element array is deprecated) and floor the probability so an
    # underflow to 0 cannot raise a math domain error.
    loss = loss-math.log(max(pred[int(y),0],np.finfo(float).tiny))
    # Gradient of the cross-entropy w.r.t. the softmax *inputs*. Chaining
    # -yl/pred through the softmax Jacobian simplifies analytically to
    # pred - yl, which avoids dividing by a probability that may be 0.
    dinput_l = pred-yl

    # Backward pass; the optimizer accumulates each layer's gradients.
    layer3.backward(dinput_l)
    sgd_opt.store_params(layer3)

    act2.backward(layer3.dinputs)

    layer2.backward(act2.dinputs)
    sgd_opt.store_params(layer2)

    act1.backward(layer2.dinputs)

    layer1.backward(act1.dinputs)
    sgd_opt.store_params(layer1)

    # Update once exactly nb samples have been accumulated, so the division
    # by the batch size inside update() is correct for every batch.
    pending += 1
    if pending == nb:
      sgd_opt.update(layer3,nb)
      sgd_opt.update(layer2,nb)
      sgd_opt.update(layer1,nb)
      pending = 0

  # Flush a trailing partial batch so its gradients do not leak into the next epoch.
  if pending:
    sgd_opt.update(layer3,pending)
    sgd_opt.update(layer2,pending)
    sgd_opt.update(layer1,pending)
    pending = 0

  print(loss)

34819.31707318891
14735.776289706706
12745.58835445112
11693.636769428529
10950.60169713977
10375.246006246432
9911.843359016253
9527.952788717914
9204.8666622844
8931.67726381843
8700.236996402935
8503.13899787281
8333.4905700754
8185.277295503866
8053.608953889134
7934.696557741912
7825.666273092317
7724.33590403251
7629.0160056697305
7538.367229042178


In [24]:
#Testing
# Run every test sample through the trained network; accumulate the
# cross-entropy, the number of correct predictions and the confusion matrix
# (rows: true class, columns: predicted class).
conf_matrix = np.zeros((classes,classes))
loss = 0
count = 0
for i in range(X_test.shape[0]):
  x = X_test[i,:]
  y = Y_test[i]
  x = x.reshape((x.shape[0],1))       # feature vector as a column

  layer1.forward(x)
  act1.forward(layer1.output)
  layer2.forward(act1.output)
  act2.forward(layer2.output)
  layer3.forward(act2.output)
  softmax.forward(layer3.output)
  pred = softmax.output

  v = int(y)
  # Index down to a scalar (math.log on a 1-element array is deprecated) and
  # floor the probability so an underflow to 0 cannot raise a math domain error.
  loss = loss-math.log(max(pred[v,0],np.finfo(float).tiny))
  k = np.argmax(pred)
  if k==v:
    count+=1
  conf_matrix[v][k] += 1
print(loss)
print(count * 100 /X_test.shape[0])


4339.391546049375
92.525
