<a href="https://colab.research.google.com/github/GauravKanwat/DL-CS6910/blob/main/DL_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install wandb

In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
import matplotlib.pyplot as plt
from time import sleep
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import wandb

import sys
sys.path.append('Assignment_1')
import hyperparameter_config

# Do not paste an API key into the notebook: with no explicit key, wandb.login()
# uses the WANDB_API_KEY environment variable or previously stored credentials,
# and otherwise prompts interactively (which is what happens on Colab).
wandb.login()

In [None]:
def printImages(x_train, y_train):
  """Log one sample image per class to Weights & Biases under the key "Images".

  Args:
    x_train: indexable collection of images; x_train[i] is passed to wandb.Image.
    y_train: iterable of integer class ids (0-9), aligned with x_train.

  Raises:
    ValueError: if one of the ten class ids does not occur in y_train.
  """
  classes = {0:"T-shirt/top", 1:"Trouser", 2:"Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle Boot"}

  # Materialise the labels once instead of rebuilding the list for every class.
  label_list = list(y_train)

  # images --> first image found for each class; labels --> its class name
  images = []
  labels = []
  for class_id, class_name in classes.items():
    first_position = label_list.index(class_id)
    images.append(x_train[first_position])
    labels.append(class_name)

  # The step used to be the leaked loop variable (an arbitrary sample index),
  # which pushed the run's step counter forward and made later
  # wandb.log(..., step=epoch) calls with smaller steps get dropped.
  # commit=False attaches the images to the current step without advancing it.
  wandb.log({"Images": [wandb.Image(image, caption=caption) for image, caption in zip(images, labels)]}, commit=False)

In [3]:
import numpy as np
from tqdm import tqdm
import wandb

class NeuralNetwork:
    """Fully connected feed-forward classifier with a softmax output layer.

    Parameters live outside the object in dictionaries keyed by layer index:
    ``weights[l]`` has shape ``(fan_out, fan_in)`` and ``biases[l]`` has shape
    ``(fan_out, 1)``. Index ``0`` maps the input to the first hidden layer and
    index ``num_hidden_layers`` maps the last hidden layer to the output.
    Samples are stored column-wise, i.e. an input batch is ``(features, batch)``.

    The object itself only keeps the layer layout, the most recent input seen
    by ``feedforward_propagation`` (needed for the first-layer gradient) and
    the running accumulators of the AdaGrad / RMSProp optimizers.
    """

    def __init__(self, num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons):
        """Store the layer layout: input size, hidden sizes, depth, output size."""
        self.num_of_pixels = num_of_pixels
        self.hidden_neurons_list = hidden_neurons_list
        self.num_hidden_layers = num_hidden_layers
        self.output_neurons = output_neurons


    def initialize_parameters(self, num_of_pixels, hidden_neurons_list, num_hidden_layers, output_neurons, initialization):
        """Create the weights, biases and zeroed "previous update" buffers.

        Args:
            num_of_pixels: number of input features.
            hidden_neurons_list: width of every hidden layer, in order.
            num_hidden_layers: number of hidden layers; must equal
                ``len(hidden_neurons_list)``.
            output_neurons: number of output units.
            initialization: ``"Xavier"`` draws weights from a standard normal
                scaled by ``sqrt(1 / fan_in)`` with zero biases; any other
                value draws weights and biases uniformly from [-0.5, 0.5).

        Returns:
            ``(weights, biases, prev_weights, prev_biases)``: four dicts keyed
            by layer index ``0 .. num_hidden_layers``; the ``prev_*`` entries
            are zero arrays shaped like the matching parameter.

        Raises:
            ValueError: if ``num_hidden_layers`` disagrees with the list length.
        """
        hidden_sizes = list(hidden_neurons_list)
        if len(hidden_sizes) != num_hidden_layers:
            raise ValueError(
                f"num_hidden_layers={num_hidden_layers} does not match "
                f"len(hidden_neurons_list)={len(hidden_sizes)}"
            )

        layer_sizes = [num_of_pixels] + hidden_sizes + [output_neurons]
        use_xavier = initialization == "Xavier"

        weights = {}
        biases = {}
        for l in range(num_hidden_layers + 1):
            fan_in, fan_out = layer_sizes[l], layer_sizes[l + 1]
            if use_xavier:
                weights[l] = np.random.randn(fan_out, fan_in) * np.sqrt(1 / fan_in)
                biases[l] = np.zeros((fan_out, 1))
            else:
                # rand() is uniform on [0, 1); shifting by 0.5 centres it on zero.
                weights[l] = np.random.rand(fan_out, fan_in) - 0.5
                biases[l] = np.random.rand(fan_out, 1) - 0.5

        prev_weights = {l: np.zeros_like(w) for l, w in weights.items()}
        prev_biases = {l: np.zeros_like(b) for l, b in biases.items()}
        return weights, biases, prev_weights, prev_biases

    # ------------------------------------------------------------------
    # Activations
    # ------------------------------------------------------------------

    def sigmoid(self, x):
        """Logistic function, evaluated as ``0.5 * (1 + tanh(x / 2))``.

        This form is mathematically identical to ``1 / (1 + exp(-x))`` but
        never overflows, and it tends to 0 (not 1) for large negative inputs.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def reLU(self, Z):
        """Element-wise ``max(0, Z)``."""
        return np.maximum(0, Z)

    def tanh(self, x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    def identity(self, x):
        """Linear activation: returns the input unchanged."""
        return x

    def softmax(self, x):
        """Column-wise softmax (each column of ``x`` is one sample)."""
        # Subtracting the per-column maximum avoids overflow in exp().
        shifted = np.exp(x - np.max(x, axis=0))
        return shifted / np.sum(shifted, axis=0)

    def _activation(self, name):
        """Return the activation callable for ``name`` or raise ValueError."""
        table = {
            "reLU": self.reLU,
            "sigmoid": self.sigmoid,
            "tanh": self.tanh,
            "identity": self.identity,
        }
        if name not in table:
            raise ValueError(f"Unknown activation function {name!r}; expected one of {sorted(table)}")
        return table[name]

    def _activation_derivative(self, name):
        """Return the derivative callable for ``name`` or raise ValueError."""
        table = {
            "reLU": self.deriv_reLU,
            "sigmoid": self.deriv_sigmoid,
            "tanh": self.deriv_tanh,
            "identity": self.deriv_identity,
        }
        if name not in table:
            raise ValueError(f"Unknown activation function {name!r}; expected one of {sorted(table)}")
        return table[name]


    def feedforward_propagation(self, X, weights, biases, num_hidden_layers, activation_function):
        """Run a forward pass.

        Args:
            X: input batch of shape ``(features, batch)``.
            weights, biases: parameter dicts keyed by layer index.
            num_hidden_layers: number of hidden layers to apply.
            activation_function: ``"reLU"``, ``"sigmoid"``, ``"tanh"`` or
                ``"identity"``; used for every hidden layer.

        Returns:
            ``(a, h, y_hat)``: ``a`` lists the pre-activations of every layer
            (hidden layers followed by the output layer), ``h`` lists the
            hidden activations and ``y_hat`` is the softmax of ``a[-1]``.

        Raises:
            ValueError: if ``activation_function`` is not recognised.
        """
        activate = self._activation(activation_function)

        # Remember the input so back_propagation can form the first-layer
        # gradient even when the caller does not pass X explicitly.
        self._last_input = X

        a = []
        h = []
        layer_input = X
        for k in range(num_hidden_layers):
            a.append(np.dot(weights[k], layer_input) + biases[k])
            h.append(activate(a[k]))
            layer_input = h[k]

        a.append(np.dot(weights[num_hidden_layers], layer_input) + biases[num_hidden_layers])
        y_hat = self.softmax(a[-1])
        return a, h, y_hat

    def one_hot(self, Y):
        """One-hot encode integer labels into a ``(classes, samples)`` matrix.

        The number of classes is ``self.output_neurons`` (10 when that
        attribute is unavailable); ``Y`` must be an integer NumPy array.
        """
        num_classes = getattr(self, "output_neurons", 10)
        one_hot_Y = np.zeros((Y.size, num_classes))
        one_hot_Y[np.arange(Y.size), Y] = 1
        return one_hot_Y.T

    def deriv_sigmoid(self, Z):
        """Derivative of the logistic function at ``Z``."""
        func = self.sigmoid(Z)
        return func * (1 - func)

    def deriv_reLU(self, Z):
        """Derivative of ReLU at ``Z`` as a boolean mask (``Z > 0``)."""
        return Z > 0

    def deriv_tanh(self, x):
        """Derivative of tanh at ``x``."""
        return 1 - np.tanh(x)**2

    def deriv_identity(self, x):
        """Derivative of the identity activation (the constant 1)."""
        return 1

    def back_propagation(self, Y, fwd_A, fwd_H, weights, biases, pred_output, num_hidden_layers, activation_function, X=None):
        """Compute batch-averaged gradients of the softmax cross-entropy loss.

        Args:
            Y: integer labels of the batch.
            fwd_A, fwd_H, pred_output: the ``(a, h, y_hat)`` triple returned by
                ``feedforward_propagation`` for the same batch.
            weights, biases: the parameters used in that forward pass.
            num_hidden_layers: number of hidden layers.
            activation_function: name of the hidden activation.
            X: optional input batch ``(features, batch)``. When omitted, the
                input cached by the last ``feedforward_propagation`` call is
                used, provided its batch size matches ``Y``.

        Returns:
            ``(dW, dB)``: dicts keyed by layer index. Layers
            ``1 .. num_hidden_layers`` are always present; layer ``0`` is
            present whenever the input batch is known.

        Raises:
            ValueError: if ``activation_function`` is not recognised.

        NOTE: the output error ``pred_output - one_hot(Y)`` is the gradient of
        softmax + cross-entropy; the ``loss`` setting and L2 weight decay used
        by ``loss_function`` are not reflected in these gradients.
        """
        one_hot_Y = self.one_hot(Y)
        batch = one_hot_Y.shape[1]
        derivative = self._activation_derivative(activation_function)

        if X is None:
            X = getattr(self, "_last_input", None)
        if X is not None and not (np.ndim(X) == 2 and np.shape(X)[1] == batch):
            # Cached input belongs to a different batch; skip layer 0.
            X = None

        dW = {}
        dB = {}
        dA = pred_output - one_hot_Y

        for k in range(num_hidden_layers, 0, -1):
            # Average over the batch so dW is on the same scale as dB and
            # matches the mean-reduced loss.
            dW[k] = np.dot(dA, fwd_H[k-1].T) / batch
            dB[k] = np.mean(dA, axis=1, keepdims=True)

            dH = np.dot(weights[k].T, dA)
            dA = np.multiply(dH, derivative(fwd_A[k-1]))

        if X is not None:
            # First layer: its input is the data itself rather than an activation.
            dW[0] = np.dot(dA, X.T) / batch
            dB[0] = np.mean(dA, axis=1, keepdims=True)
        return dW, dB

    def get_predictions(self, pred_output):
        """Return the arg-max class of every column of ``pred_output``."""
        return np.argmax(pred_output, axis = 0)

    def get_accuracy(self, y_pred, y_true):
        """Fraction of positions where ``y_pred`` equals ``y_true``."""
        return np.sum(y_pred == y_true) / y_true.size


    def loss_function(self, y_pred, y_true, loss, weights, weight_decay):
        """Data loss plus L2 penalty ``0.5 * weight_decay * sum(w ** 2)``.

        Args:
            y_pred: predicted probabilities, ``(classes, samples)``.
            y_true: one-hot targets with the same shape.
            loss: ``'cross_entropy'`` (mean over samples) or ``'mse'``.
            weights: weight dict; every entry contributes to the penalty.
            weight_decay: L2 coefficient.

        Raises:
            ValueError: if ``loss`` is not recognised.

        NOTE: for ``'mse'`` the squared error is summed over every entry
        (classes and samples) and is not averaged per sample.
        """
        if loss == 'cross_entropy':
            epsilon = 1e-30  # keeps log() finite when a probability underflows to 0
            data_loss = -np.mean(np.sum(y_true * np.log(y_pred + epsilon), axis=0))
        elif loss == 'mse':
            data_loss = np.sum((y_pred - y_true) ** 2)
        else:
            raise ValueError(f"Unknown loss {loss!r}; expected 'cross_entropy' or 'mse'")

        # L2 Regularisation
        reg_loss = 0.5 * weight_decay * sum(np.sum(w ** 2) for w in weights.values())
        return data_loss + reg_loss


    # Gradient descent and Optimizers
    #
    # Every optimizer updates exactly the layers that have an entry in dW, so
    # the first layer is trained whenever back_propagation supplies dW[0].
    # Parameter arrays are modified in place and the same dicts are returned.

    def _running_squares(self, optimizer_name, dW, dB):
        """Return the persistent squared-gradient accumulators of an optimizer.

        The accumulators are created as zeros on first use and recreated when
        the gradient shapes no longer match (e.g. a different architecture).
        """
        store = self.__dict__.setdefault("_optimizer_state", {})
        state = store.get(optimizer_name)
        stale = state is None or any(
            l not in state[0] or np.shape(state[0][l]) != np.shape(dW[l]) for l in dW
        )
        if stale:
            state = (
                {l: np.zeros_like(dW[l]) for l in dW},
                {l: np.zeros_like(dB[l]) for l in dB},
            )
            store[optimizer_name] = state
        return state

    def gradient_descent(self, weights, biases, dW, dB, eta):
        """Plain gradient step: ``param -= eta * grad``."""
        for l in sorted(dW):
            weights[l] -= eta * dW[l]
            biases[l] -= eta * dB[l]

        return weights, biases


    def momentum_based_gradient_descent(self, weights, biases, prev_weights, prev_biases, dW, dB, eta, momentum):
        """Momentum step: ``u = momentum * u_prev + eta * grad; param -= u``.

        ``prev_weights`` / ``prev_biases`` hold the previous update ``u`` and
        are overwritten with the new one.
        """
        for l in sorted(dW):
            uw = momentum * prev_weights[l] + eta * dW[l]
            ub = momentum * prev_biases[l] + eta * dB[l]

            # Update current and prev weights and biases
            weights[l] -= uw
            biases[l] -= ub
            prev_weights[l] = uw
            prev_biases[l] = ub
        return weights, biases, prev_weights, prev_biases


    def adagrad_gradient_descent(self, weights, biases, dW, dB, eta, eps):
        """AdaGrad step using the sum of all squared gradients seen so far.

        The accumulators are kept on the instance between calls; resetting
        them on every call would reduce the update to ``eta * sign(grad)``.
        """
        v_w, v_b = self._running_squares("adagrad", dW, dB)

        for l in sorted(dW):
            v_w[l] = v_w[l] + dW[l]**2
            v_b[l] = v_b[l] + dB[l]**2

            # Update weights and biases
            weights[l] -= eta * dW[l] / (np.sqrt(v_w[l]) + eps)
            biases[l] -= eta * dB[l] / (np.sqrt(v_b[l]) + eps)

        return weights, biases


    def rmsProp_gradient_descent(self, weights, biases, dW, dB, eta, eps, beta):
        """RMSProp step using an exponential average of squared gradients.

        The running averages are kept on the instance between calls.
        """
        v_w, v_b = self._running_squares("rmsProp", dW, dB)

        for l in sorted(dW):
            v_w[l] = (beta * v_w[l]) + ((1-beta) * dW[l] ** 2)
            v_b[l] = (beta * v_b[l]) + ((1-beta) * dB[l] ** 2)

            # Update weights and biases
            weights[l] -= eta * dW[l] / (np.sqrt(v_w[l]) + eps)
            biases[l] -= eta * dB[l] / (np.sqrt(v_b[l]) + eps)

        return weights, biases


    def adam_gradient_descent(self, weights, biases, ts, v_w, v_b, m_w, m_b, dW, dB, eta, eps, beta1, beta2):
        """Adam step with bias-corrected first and second moments.

        Args:
            ts: 1-based step counter used for bias correction.
            v_w, v_b: second-moment estimates (updated and returned).
            m_w, m_b: first-moment estimates (updated and returned).
            The moment dicts must contain every layer index present in ``dW``.

        Returns:
            ``(weights, biases, v_w, v_b, m_w, m_b, ts + 1)``.
        """
        m_correction = 1 - np.power(beta1, ts)
        v_correction = 1 - np.power(beta2, ts)

        for l in sorted(dW):
            m_w[l] = (beta1 * m_w[l]) + (1-beta1) * dW[l]
            m_b[l] = (beta1 * m_b[l]) + (1-beta1) * dB[l]

            v_w[l] = beta2 * v_w[l] + (1 - beta2) * (dW[l] ** 2)
            v_b[l] = beta2 * v_b[l] + (1 - beta2) * (dB[l] ** 2)

            m_w_hat = m_w[l] / m_correction
            v_w_hat = v_w[l] / v_correction
            m_b_hat = m_b[l] / m_correction
            v_b_hat = v_b[l] / v_correction

            # eps is added after the square root, as in the other optimizers
            # of this class; inside the root it would dominate small gradients.
            weights[l] -= eta * m_w_hat / (np.sqrt(v_w_hat) + eps)
            biases[l] -= eta * m_b_hat / (np.sqrt(v_b_hat) + eps)

        ts += 1

        return weights, biases, v_w, v_b, m_w, m_b, ts

    # <---------------------------------------------START--------------------------------------------------->


    ''' Add new optimizers here '''





    # <---------------------------------------------END----------------------------------------------------->



    def compute_accuracy(self, X_test, y_test, weights, biases, num_hidden_layers, activation_function):
        """Run a forward pass on ``X_test`` and return the fraction of correct labels."""
        _, _, pred_output = self.feedforward_propagation(X_test, weights, biases, num_hidden_layers, activation_function)
        pred_labels = np.argmax(pred_output, axis=0)
        accuracy = np.mean(pred_labels == y_test)
        return accuracy

In [None]:
def train_neural_network(nn, x_train_input, y_train, x_test_input, y_test, x_val, y_val, weights, biases, prev_weights, prev_biases, num_hidden_layers, activation_function, optimizer, epochs, batch_size, eta, momentum, beta, beta1, beta2, eps, weight_decay, loss):
  """Train the network with mini-batches and log per-epoch metrics to wandb.

  Args:
    nn: NeuralNetwork instance providing the forward/backward passes and optimizers.
    x_train_input, x_test_input, x_val: inputs with one sample per column.
    y_train, y_test, y_val: integer labels aligned with the matching inputs.
    weights, biases: parameter dicts keyed by layer index; updated in place.
    prev_weights, prev_biases: previous-update buffers for momentum / Nesterov;
      copies of them also seed the Adam moment estimates.
    num_hidden_layers, activation_function: forwarded to the network methods.
    optimizer: "sgd" (forces batch_size = 1), "momentum", "nesterov",
      "adagrad", "rmsProp", "adam" or "nadam".
    epochs, batch_size, eta: number of passes, mini-batch size, learning rate.
    momentum: momentum coefficient for "momentum".
    beta: look-ahead / momentum coefficient for "nesterov" and "nadam", decay for "rmsProp".
    beta1, beta2, eps: Adam coefficients; eps is also used by AdaGrad / RMSProp.
    weight_decay, loss: forwarded to nn.loss_function.

  Returns:
    (weights, biases) after training.

  Raises:
    ValueError: if `optimizer` is not one of the supported names.
  """
  supported_optimizers = ("sgd", "momentum", "nesterov", "adagrad", "rmsProp", "adam", "nadam")
  if optimizer not in supported_optimizers:
    # Previously an unknown name silently ran every epoch without any update.
    raise ValueError(f"Unknown optimizer {optimizer!r}; expected one of {supported_optimizers}")

  data_size = len(x_train_input[0])

  if optimizer == "sgd":
    batch_size = 1

  # Separate dicts for the look-ahead parameters. Binding them to `weights` /
  # `biases` directly would make every look-ahead assignment overwrite the
  # real parameters.
  lookahead_w = dict(weights)
  lookahead_b = dict(biases)
  uses_lookahead = optimizer in ("nesterov", "nadam")

  ts = 1
  v_w = prev_weights.copy()
  v_b = prev_biases.copy()
  m_w = prev_weights.copy()
  m_b = prev_biases.copy()

  for epoch in tqdm(range(epochs)):
    total_train_loss = 0
    num_batches = 0

    # Only full batches are used; a trailing partial batch is skipped.
    for i in range(0, data_size - batch_size + 1, batch_size):
      X_batch = x_train_input[:, i:i+batch_size]
      Y_batch = y_train[i:i+batch_size]

      if uses_lookahead:
        # Partial update: evaluate the gradient at w - beta * previous_update.
        # NOTE(review): on the "nadam" path nothing updates prev_weights /
        # prev_biases, so with zero-initialised buffers the look-ahead point
        # equals the current parameters and "nadam" behaves like "adam".
        for l in range(num_hidden_layers + 1):
          lookahead_w[l] = weights[l] - beta * prev_weights[l]
          lookahead_b[l] = biases[l] - beta * prev_biases[l]
        fwd_weights, fwd_biases = lookahead_w, lookahead_b
      else:
        fwd_weights, fwd_biases = weights, biases

      fwd_a, fwd_h, pred_output = nn.feedforward_propagation(X_batch, fwd_weights, fwd_biases, num_hidden_layers, activation_function)

      one_hot_Y = nn.one_hot(Y_batch)
      train_loss = nn.loss_function(pred_output, one_hot_Y, loss, weights, weight_decay)
      total_train_loss += train_loss
      num_batches += 1

      dW, dB = nn.back_propagation(Y_batch, fwd_a, fwd_h, fwd_weights, fwd_biases, pred_output, num_hidden_layers, activation_function)

      if optimizer == "sgd":
        weights, biases = nn.gradient_descent(weights, biases, dW, dB, eta)

      elif optimizer == "momentum":
        weights, biases, prev_weights, prev_biases = nn.momentum_based_gradient_descent(weights, biases, prev_weights, prev_biases, dW, dB, eta, momentum)

      elif optimizer == "nesterov":
        # The momentum update takes (eta, momentum); the stray `epochs`
        # argument that used to be passed here raised a TypeError.
        weights, biases, prev_weights, prev_biases = nn.momentum_based_gradient_descent(weights, biases, prev_weights, prev_biases, dW, dB, eta, beta)

      elif optimizer == "adagrad":
        weights, biases = nn.adagrad_gradient_descent(weights, biases, dW, dB, eta, eps)

      elif optimizer == "rmsProp":
        weights, biases = nn.rmsProp_gradient_descent(weights, biases, dW, dB, eta, eps, beta)

      elif optimizer in ("adam", "nadam"):
        weights, biases, v_w, v_b, m_w, m_b, ts = nn.adam_gradient_descent(weights, biases, ts, v_w, v_b, m_w, m_b, dW, dB, eta, eps, beta1, beta2)

      # <---------------------------------------START---------------------------------------------->

      ''' Call new optimizer here (also add its name to supported_optimizers) '''

      # <---------------------------------------END------------------------------------------------>

    # Average over the batches that were actually processed.
    avg_train_loss = total_train_loss / max(num_batches, 1)

    _, _, val_pred = nn.feedforward_propagation(x_val, weights, biases, num_hidden_layers, activation_function)
    val_one_hot = nn.one_hot(y_val)
    val_loss = nn.loss_function(val_pred, val_one_hot, loss, weights, weight_decay)
    if(loss == 'mse'):
      # The MSE value is a total over the evaluated samples, so express it per
      # `batch_size` samples of *this* split to make it comparable with the
      # per-batch training loss (the training-set size was used before).
      val_loss = val_loss / (len(x_val[0]) / batch_size)

    _, _, test_pred = nn.feedforward_propagation(x_test_input, weights, biases, num_hidden_layers, activation_function)
    test_one_hot = nn.one_hot(y_test)
    test_loss = nn.loss_function(test_pred, test_one_hot, loss, weights, weight_decay)
    if(loss == 'mse'):
      test_loss = test_loss / (len(x_test_input[0]) / batch_size)

    val_accuracy = nn.compute_accuracy(x_val, y_val, weights, biases, num_hidden_layers, activation_function)
    train_accuracy = nn.compute_accuracy(x_train_input, y_train, weights, biases, num_hidden_layers, activation_function)
    test_accuracy = nn.compute_accuracy(x_test_input, y_test, weights, biases, num_hidden_layers, activation_function)

    print(f"val accuracy: {val_accuracy * 100:.2f}%, Test accuracy : {test_accuracy * 100:.2f}, Val loss: {val_loss:.4f}, Test Loss: {test_loss:.4f}")
    wandb.log({'val_accuracy' : val_accuracy * 100, 'accuracy' : train_accuracy * 100, 'test_accuracy' : test_accuracy * 100,
               'loss' : avg_train_loss, 'val loss' : val_loss, 'test_loss' : test_loss, 'epoch' : epoch}, step=epoch)

  return weights, biases

In [None]:
import argparse

def configParse(argv=None):
    """Parse the training hyperparameters from the command line.

    Args:
        argv: optional list of argument strings. ``None`` (the default) parses
            ``sys.argv[1:]`` exactly as before. Pass ``[]`` to obtain the
            defaults, which is what a notebook needs because the kernel's own
            command-line flags would otherwise be rejected by the parser.

    Returns:
        argparse.Namespace with one attribute per option defined below.
    """
    parser = argparse.ArgumentParser(description='Train neural network with specified parameters.')
    parser.add_argument('--wandb_project', type = str, default = 'Testing', help = 'project name')
    parser.add_argument('--wandb_entity', type = str, default='Test Accuracy', help = 'wandb entity')
    parser.add_argument('--dataset', type = str, default = 'fashion_mnist', help = 'dataset')
    parser.add_argument('--epochs', type = int, default = 10, help='epochs')
    parser.add_argument('--batch_size', type = int, default = 64, help='batch size')
    parser.add_argument('--loss', type=str, default = 'cross_entropy', help='loss function')
    parser.add_argument('--optimizer', type=str, default = 'nadam', help='optimizer')
    parser.add_argument('--learning_rate', type=float, default = 1e-3, help='learning rate')
    parser.add_argument('--momentum', type=float, default = 0.9, help='Momentum')
    parser.add_argument('--beta', type=float, default = 0.9, help='beta')
    parser.add_argument('--beta1', type=float, default = 0.9, help='beta1')
    parser.add_argument('--beta2', type=float, default = 0.999, help='beta2')
    parser.add_argument('--epsilon', type=float, default = 1e-8, help='epsilon')
    parser.add_argument('--weight_decay', type=float, default = 0.0, help='weight decay')
    parser.add_argument('--weight_init', type=str, default = "Xavier", help='weight initialization')
    parser.add_argument('--num_layers', type=int, default = 3, help='number of hidden layers')
    parser.add_argument('--hidden_size', type=int, default = 128, help='size of a hidden layer')
    parser.add_argument('--activation', type=str, default = "tanh", help='activation function')

    return parser.parse_args(argv)

In [None]:
from neural_network import NeuralNetwork, train_neural_network

def printImages(x_train, y_train):
  """Log one sample image per class to Weights & Biases under the key "Images".

  Args:
    x_train: indexable collection of images; x_train[i] is passed to wandb.Image.
    y_train: iterable of integer class ids (0-9), aligned with x_train.

  Raises:
    ValueError: if one of the ten class ids does not occur in y_train.
  """
  classes = {0:"T-shirt/top", 1:"Trouser", 2:"Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle Boot"}

  # Materialise the labels once instead of rebuilding the list for every class.
  label_list = list(y_train)

  # images --> first image found for each class; labels --> its class name
  images = []
  labels = []
  for class_id, class_name in classes.items():
    first_position = label_list.index(class_id)
    images.append(x_train[first_position])
    labels.append(class_name)

  # The step used to be the leaked loop variable (an arbitrary sample index),
  # which pushed the run's step counter forward and made later
  # wandb.log(..., step=epoch) calls with smaller steps get dropped.
  # commit=False attaches the images to the current step without advancing it.
  wandb.log({"Images": [wandb.Image(image, caption=caption) for image, caption in zip(images, labels)]}, commit=False)

def main(args):
  """Load the dataset, run a single-configuration wandb sweep and log the results.

  Args:
    args: namespace with the attributes produced by configParse (dataset,
      wandb_project, wandb_entity, epochs, batch_size, loss, optimizer,
      learning_rate, momentum, beta, beta1, beta2, epsilon, weight_decay,
      weight_init, num_layers, hidden_size, activation).

  Raises:
    ValueError: if args.dataset is neither "fashion_mnist" nor "mnist".
  """

  # Taking dataset according to parameters passed by user
  if(args.dataset == "fashion_mnist"):
    (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
    class_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle Boot"]
  elif(args.dataset == "mnist"):
    (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
    class_names = ["Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine"]
  else:
    # Previously this fell through and failed later with a NameError on x_train.
    raise ValueError(f"Unsupported dataset {args.dataset!r}; expected 'fashion_mnist' or 'mnist'")

  # Hold out 10000 training samples for validation (sklearn, fixed seed)
  x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=10000, random_state=42)

  # 28 * 28 = 784 pixels
  num_of_pixels = 28 * 28

  output_neurons = 10

  def to_columns(images):
    # Scale pixels to [0, 1], flatten every image to a 784-vector and transpose
    # so that each column represents one image.
    scaled = images / 255
    return scaled.reshape(len(scaled), num_of_pixels).T

  x_train_input = to_columns(x_train)
  x_test_input = to_columns(x_test)
  x_val = to_columns(x_val)

  # Define hyperparameters: every sweep parameter has exactly one candidate
  # value, taken from the parsed arguments.
  hyperparameters = {
     'eta' : args.learning_rate,
     'epochs' : args.epochs,
     'num_hidden_layers' : args.num_layers,
     'num_hidden_neurons' : args.hidden_size,
     'activation_function' : args.activation,
     'initialization' : args.weight_init,
     'optimizer' : args.optimizer,
     'batch_size' : args.batch_size,
     'momentum' : args.momentum,
     'beta' : args.beta,
     'beta1' : args.beta1,
     'beta2' : args.beta2,
     'eps' : args.epsilon,
     'weight_decay' : args.weight_decay,
     'loss' : args.loss,
  }
  sweep_config = {
     'method' : 'random',
     'project' : args.wandb_project,
     'name' : 'Test Accuracy and Confusion Matrix',
     'entity' : args.wandb_entity,
     'metric' : {
        'name' : 'val_accuracy',
        'goal' : 'maximize',
     },
     'parameters' : {name : {'values' : [value]} for name, value in hyperparameters.items()},
  }

  def train():
    """Body of one sweep run: build, train and evaluate a network."""
    with wandb.init(project = args.wandb_project, entity = args.wandb_entity) as run:

      # Creates names of runs based on parameters. Example => hl_4_bs_64_ac_reLU
      config = wandb.config
      run_name = "hl_" + str(config.num_hidden_layers) + "_bs_" + str(config.batch_size) + "_ac_" + config.activation_function
      wandb.run.name = run_name

    #   printImages(x_train, y_train)           ---> run when want to print the images

      # Every hidden layer has the same width
      hidden_neurons_list = [config.num_hidden_neurons] * config.num_hidden_layers

      # Creates an object of class NeuralNetwork and Initializes the parameters
      nn = NeuralNetwork(num_of_pixels, hidden_neurons_list, config.num_hidden_layers, output_neurons)
      weights, biases, prev_weights, prev_biases = nn.initialize_parameters(num_of_pixels, hidden_neurons_list, config.num_hidden_layers, output_neurons, config.initialization)

      # Train the network
      weights, biases = train_neural_network(nn, x_train_input, y_train, x_test_input, y_test, x_val, y_val, weights, biases, prev_weights, prev_biases, config.num_hidden_layers, config.activation_function,
                                             config.optimizer, config.epochs, config.batch_size, config.eta, config.momentum, config.beta, config.beta1, config.beta2, config.eps, config.weight_decay, config.loss)

      # Confusion matrix on the test set
      _, _, y_test_pred = nn.feedforward_propagation(x_test_input, weights, biases, config.num_hidden_layers, config.activation_function)
      y_test_pred = np.argmax(y_test_pred, axis=0)
      conf_matrix = wandb.plot.confusion_matrix(y_true = y_test, preds = y_test_pred, class_names = class_names)
      # wandb.plot.* only builds the chart object; it has to be logged to
      # appear in the run (it used to be created and then discarded).
      wandb.log({"confusion_matrix": conf_matrix})
      wandb.sklearn.plot_confusion_matrix(y_test, y_test_pred, class_names)


  sweep_id = wandb.sweep(sweep=sweep_config)
  wandb.agent(sweep_id, function=train,count=1)
  wandb.finish()

# Entry point: parse the hyperparameters with hyperparameter_config.configParse()
# and hand them to main(), which runs the wandb sweep.
if __name__ == "__main__":
    args = hyperparameter_config.configParse()
    main(args)
