<a href="https://colab.research.google.com/github/GauravKanwat/CS6910_Assignment1/blob/main/DL_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
from time import sleep
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split

In [14]:
class NeuralNetwork:
    def __init__(self, num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons):
      self.num_of_pixels = num_of_pixels
      self.hidden_neurons_list = hidden_neurons_list
      self.num_hidden_layers = num_hidden_layers
      self.output_neurons = output_neurons


    def initialize_parameters(self, num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons, initialization):
      '''
          Initializing the weights and biases, both are dictionary which are storing random values generated by rand between (0 to 1) and subtracting 0.5 from it makes it between
          -0.5 to 0.5
      '''

      weights = {}
      biases = {}
      num_layers = len(hidden_neurons_list) + 1

      if initialization == "xavier":
        weights[0] = np.random.randn(hidden_neurons_list[0], num_of_pixels) * np.sqrt(1 / num_of_pixels)
        biases[0] = np.zeros((hidden_neurons_list[0], 1))

        # Initialize weights and biases for hidden layers
        for l in range(1, len(hidden_neurons_list)):
          weights[l] = np.random.randn(hidden_neurons_list[l], hidden_neurons_list[l-1]) * np.sqrt(1 / hidden_neurons_list[l-1])
          biases[l] = np.zeros((hidden_neurons_list[l], 1))

        # Initialize weights for last hidden layer to output layer
        weights[len(hidden_neurons_list)] = np.random.randn(output_neurons, hidden_neurons_list[-1]) * np.sqrt(1 / hidden_neurons_list[-1])
        biases[len(hidden_neurons_list)] = np.zeros((output_neurons, 1))
        return weights, biases

      else:
        weights[0] = np.random.rand(hidden_neurons_list[0], num_of_pixels) - 0.5
        biases[0] = np.random.rand(hidden_neurons_list[0], 1) - 0.5
        for l in range(num_hidden_layers):
          weights[l] = np.random.rand(hidden_neurons_list[l], num_of_pixels if l == 0 else hidden_neurons_list[l-1]) - 0.5
          biases[l] = np.random.rand(hidden_neurons_list[l], 1) - 0.5
        weights[num_hidden_layers] = np.random.rand(output_neurons, hidden_neurons_list[-1]) - 0.5
        biases[num_hidden_layers] = np.random.rand(output_neurons, 1) - 0.5
      return weights, biases

    def sigmoid(self, x):
      sigmoid_x = np.where(x < -30, 1, 1 / (1 + np.exp(-x)))
      return sigmoid_x

    def reLU(self, Z):
        return np.maximum(0, Z)

    def tanh(self, x):
      return np.tanh(x)

    def softmax(self, x):
        max_x = np.max(x, axis=0)
        exp_x = np.exp(x - max_x)  # avoiding overflow
        return exp_x / np.sum(exp_x, axis=0)

    def feedforward_propagation(self, X, weights, biases, num_hidden_layers, activation_function):
      a = []
      h = []

      for k in range(num_hidden_layers):
          if k == 0:
            a.append(np.dot(weights[k], X) + biases[k])
            if(activation_function == "reLU"):
              h.append(self.reLU(a[k]))
            elif(activation_function == "sigmoid"):
              h.append(self.sigmoid(a[k]))
            elif(activation_function == "tanh"):
              h.append(self.tanh(a[k]))
          else:
            a.append(np.dot(weights[k], h[k-1]) + biases[k])
            if(activation_function == "reLU"):
              h.append(self.reLU(a[k]))
            elif(activation_function == "sigmoid"):
              h.append(self.sigmoid(a[k]))
            elif(activation_function == "tanh"):
              h.append(self.tanh(a[k]))

      a.append(np.dot(weights[num_hidden_layers], h[num_hidden_layers - 1]) + biases[num_hidden_layers])
      y_hat = self.softmax(a[-1])
      return a, h, y_hat

    def one_hot(self, Y):
      if Y.max() != 9:
        one_hot_Y = np.zeros((Y.size, 10))
      else:
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
      one_hot_Y[np.arange(Y.size), Y] = 1
      one_hot_Y = one_hot_Y.T
      return one_hot_Y

    def deriv_sigmoid(self, Z):
      func = self.sigmoid(Z)
      return func * (1 - func)

    def deriv_reLU(self, Z):
      return Z > 0

    def deriv_tanh(self, x):
      return 1 - np.tanh(x)**2
      # sechX = 1 / np.cosh(x)
      # return sechX ** 2

    def back_propagation(self, Y, fwd_A, fwd_H, weights, biases, pred_output, num_hidden_layers, activation_function):
      one_hot_Y = self.one_hot(Y)
      dA = {}
      dH = {}
      dW = {}
      dB = {}

      dA[num_hidden_layers] = pred_output - one_hot_Y

      for k in range(num_hidden_layers, 0, -1):
        dW[k] = np.dot(dA[k], fwd_H[k-1].T)
        dB[k] = np.mean(dA[k], axis=1, keepdims=True)

        dH[k-1] = np.dot(weights[k].T, dA[k])
        if(activation_function == "reLU"):
          dA[k-1] = np.multiply(dH[k-1], self.deriv_reLU(fwd_A[k-1]))
        elif(activation_function == "sigmoid"):
          dA[k-1] = np.multiply(dH[k-1], self.deriv_sigmoid(fwd_A[k-1]))
        elif(activation_function == "tanh"):
          dA[k-1] = np.multiply(dH[k-1], self.deriv_tanh(fwd_A[k-1]))
      return dW, dB

    def get_predictions(self, pred_output):
      return np.argmax(pred_output, axis = 0)

    def get_accuracy(self, y_pred, y_true):
      return np.sum(y_pred == y_true) / y_true.size

    def cross_entropy(self, y_pred, y_true):
     epsilon = 1e-15
     loss = -np.mean(np.sum(y_true * np.log(y_pred + epsilon), axis=0))
     return loss

    def gradient_descent(self, weights, biases, dW, dB, eta):
      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        weights[l] -= eta * dW[l]
        biases[l] -= eta * dB[l]

      return weights, biases

    def momentum_based_gradient_descent(self, weights, biases, dW, dB, epochs, eta, beta):
      prev_uw = {}
      prev_ub = {}

      for l in range(1, self.num_hidden_layers + 1):
        prev_uw[l] = 0
        prev_ub[l] = 0
      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        uw = beta * prev_uw[l] + eta * dW[l]
        ub = beta * prev_ub[l] + eta * dB[l]
        weights[l] -= uw
        biases[l] -= ub
        prev_uw[l] = uw
        prev_ub[l] = ub
      return weights, biases

    def nesterov_accelerated_gradient_descent(self, weights, biases, dW, dB, epochs, eta, beta):
      prev_vw = 0
      prev_vb = 0
      v_w = beta*prev_vw
      v_b = beta*prev_vb

      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        vw = beta * prev_vw + eta * dW[l]
        vb = beta * prev_vb + eta * dB[l]
        weights[l] -= vw
        biases[l] -= vb
        prev_uw = vw
        prev_ub = vb

      return weights, biases

    def adagrad_gradient_descent(self, weights, biases, dW, dB, eta, eps):
      v_w = {}
      v_b = {}

      for l in range(1, self.num_hidden_layers + 1):
        v_w[l] = 0
        v_b[l] = 0
      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        v_w[l] = v_w[l] + dW[l]**2
        v_b[l] = v_b[l] + dB[l]**2
        weights[l] -= eta * dW[l] / (np.sqrt(v_w[l]) + eps)
        biases[l] -= eta * dB[l] / (np.sqrt(v_b[l]) + eps)

      return weights, biases

    def rmsProp_gradient_descent(self, weights, biases, dW, dB, epochs, eta, eps, beta):
      v_w = {}
      v_b = {}

      for l in range(1, self.num_hidden_layers + 1):
        v_w[l] = 0
        v_b[l] = 0

      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        v_w[l] = (beta * v_w[l]) + ((1-beta) * dW[l] ** 2)
        v_b[l] = (beta * v_b[l]) + ((1-beta) * dB[l] ** 2)

        weights[l] -= eta * dW[l] / (np.sqrt(v_w[l]) + eps)
        biases[l] -= eta * dB[l] / (np.sqrt(v_b[l]) + eps)

      return weights, biases

    def adam_gradient_descent(self, weights, biases, dW, dB, epochs, eta, eps, beta1, beta2):
      m_w = {}
      m_b = {}
      v_w = {}
      v_b = {}
      m_w_hat = {}
      m_b_hat = {}
      v_w_hat = {}
      v_b_hat = {}

      for l in range(1, self.num_hidden_layers + 1):
        m_w[l] = 0
        m_b[l] = 0
        v_w[l] = 0
        v_b[l] = 0
        m_w_hat[l] = 0
        m_b_hat[l] = 0
        v_w_hat[l] = 0
        v_b_hat[l] = 0

      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        m_w[l] = (beta1 * m_w[l]) + (1-beta1) * dW[l]
        m_b[l] = (beta1 * m_b[l]) + (1-beta1) * dB[l]

        v_w[l] = beta2 * v_w[l] + (1 - beta2) * (dW[l] ** 2)
        v_b[l] = beta2 * v_b[l] + (1 - beta2) * (dB[l] ** 2)

        m_w_hat[l] = m_w[l]/(1-np.power(beta1, l+1))
        m_b_hat[l] = m_b[l]/(1-np.power(beta1, l+1))
        v_w_hat[l] = v_w[l]/(1-np.power(beta2, l+1))
        v_b_hat[l] = v_b[l]/(1-np.power(beta2, l+1))

        #update parameters
        weights[l] -= eta*m_w_hat[l]/(np.sqrt(v_w_hat[l])+eps)
        biases[l] -= eta*m_b_hat[l]/(np.sqrt(v_b_hat[l])+eps)

      return weights, biases

    def nadam_gradient_descent(self, weights, biases, dW, dB, epochs, eta, eps, beta1, beta2):
      m_w = {}
      m_b = {}
      v_w = {}
      v_b = {}
      m_w_hat = {}
      m_b_hat = {}
      v_w_hat = {}
      v_b_hat = {}

      for l in range(1, self.num_hidden_layers + 1):
        m_w[l] = 0
        m_b[l] = 0
        v_w[l] = 0
        v_b[l] = 0
        m_w_hat[l] = 0
        m_b_hat[l] = 0
        v_w_hat[l] = 0
        v_b_hat[l] = 0

      # Update weights and biases
      for l in range(1, self.num_hidden_layers + 1):
        m_w[l] = (beta1 * m_w[l]) + (1-beta1) * dW[l]
        m_b[l] = (beta1 * m_b[l]) + (1-beta1) * dB[l]

        v_w[l] = beta2 * v_w[l] + (1 - beta2) * (dW[l] ** 2)
        v_b[l] = beta2 * v_b[l] + (1 - beta2) * (dB[l] ** 2)

        m_w_hat[l] = m_w[l]/(1-np.power(beta1, l+1))
        m_b_hat[l] = m_b[l]/(1-np.power(beta1, l+1))
        v_w_hat[l] = v_w[l]/(1-np.power(beta2, l+1))
        v_b_hat[l] = v_b[l]/(1-np.power(beta2, l+1))

        #update parameters
        weights[l] -= (eta/np.sqrt(v_w_hat[l] + eps)) * (beta1 * m_w_hat[l] + (1-beta1) * dW[l] / (1-np.power(beta1, l+1)))
        biases[l] -= (eta/np.sqrt(v_b_hat[l] + eps)) * (beta1 * m_b_hat[l] + (1-beta1) * dB[l] / (1-np.power(beta1, l+1)))

      return weights, biases


    def compute_accuracy(self, X_test, y_test, weights, biases, num_hidden_layers, activation_function):

      _, _, pred_output = self.feedforward_propagation(X_test, weights, biases, num_hidden_layers, activation_function)
      pred_labels = np.argmax(pred_output, axis=0)
      accuracy = np.mean(pred_labels == y_test)
      return accuracy

In [16]:
def main():
  fashion_mnist = keras.datasets.fashion_mnist
  (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
  x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=10000, random_state=42)
  classes = {0:"T-shirt/top", 1:"Trouser", 2:"Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle Boot"}
  x_train_norm = x_train / 255
  x_test_norm = x_test / 255
  x_val_norm = x_val / 255

  # Define hyperparameters
  num_of_pixels = 28 * 28                                                         #28 * 28 = 784 pixels
  num_hidden_layers = 3
  num_hidden_neurons = 128
  hidden_neurons_list = []
  for i in range(num_hidden_layers):
    hidden_neurons_list.append(num_hidden_neurons)
  output_neurons = 10
  eta = 1e-3
  epochs = 10
  activation_function = "tanh"
  initialization = "normal"
  opt_function = "simple"
  batch_size = 32
  beta = 0.5
  beta1 = 0.9
  beta2 = 0.999
  eps = 1e-8

  #Taking pixels as inputs
  x_train_input = x_train_norm.reshape(len(x_train_norm), num_of_pixels)                      #flattening the image into 1d array
  x_test_input = x_test_norm.reshape(len(x_test_norm), num_of_pixels)                         #same thing
  x_val_reshape = x_val_norm.reshape(len(x_val_norm), num_of_pixels)
  x_train_input = x_train_input.T
  x_test_input = x_test_input.T
  x_val = x_val_reshape.T

  data_size = len(x_train_input[0])
  #print(x_train_input.shape)
  #print(data_size)


  nn = NeuralNetwork(num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons)

  weights, biases = nn.initialize_parameters(num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons, initialization)

  for iter in tqdm(range(epochs)):
    for i in range(0, data_size, batch_size):
      if i<= data_size - batch_size:
        X_batch = x_train_input[:, i:i+batch_size]
        Y_batch = y_train[i:i+batch_size]

        if opt_function == "simple":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.gradient_descent(weights, biases, dW, dB, eta)

        elif opt_function == "momentum":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.momentum_based_gradient_descent(weights, biases, dW, dB, epochs, eta, beta)

        elif opt_function == "nesterov":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.nesterov_accelerated_gradient_descent(weights, biases, dW, dB, epochs, eta, beta)
        elif opt_function == "rmsProp":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.rmsProp_gradient_descent(weights, biases, dW, dB, epochs, eta, eps, beta)
        elif opt_function == "adam":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.adam_gradient_descent(weights, biases, dW, dB, epochs, eta, eps, beta1, beta2)
        elif opt_function == "nadam":
          fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, weights, biases, num_hidden_layers, activation_function)
          dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, weights, biases, pred_output, num_hidden_layers, activation_function)
          weights, biases = nn.nadam_gradient_descent(weights, biases, dW, dB, epochs, eta, eps, beta1, beta2)

    accuracy = nn.compute_accuracy(x_val, y_val, weights, biases, num_hidden_layers, activation_function)
    print(f"Accuracy on Validation set: {accuracy * 100:.2f}%")

if __name__ == "__main__":
    main()

 10%|█         | 1/10 [00:07<01:05,  7.24s/it]

Accuracy on Validation set: 77.31%


 20%|██        | 2/10 [00:12<00:49,  6.17s/it]

Accuracy on Validation set: 79.48%


 30%|███       | 3/10 [00:19<00:45,  6.57s/it]

Accuracy on Validation set: 80.18%


 40%|████      | 4/10 [00:25<00:36,  6.08s/it]

Accuracy on Validation set: 80.76%


 50%|█████     | 5/10 [00:32<00:32,  6.43s/it]

Accuracy on Validation set: 81.28%


 60%|██████    | 6/10 [00:37<00:24,  6.14s/it]

Accuracy on Validation set: 81.53%


 70%|███████   | 7/10 [00:47<00:22,  7.43s/it]

Accuracy on Validation set: 81.74%


 80%|████████  | 8/10 [00:55<00:15,  7.55s/it]

Accuracy on Validation set: 82.05%


 90%|█████████ | 9/10 [01:00<00:06,  6.87s/it]

Accuracy on Validation set: 82.16%


100%|██████████| 10/10 [01:07<00:00,  6.71s/it]

Accuracy on Validation set: 82.23%



