### **Importing Libraries**

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.datasets import fashion_mnist
from sklearn.model_selection  import train_test_split
import itertools
import math
!pip install wandb
import wandb
%matplotlib inline

In [None]:
# Log in to Weights & Biases so the training runs below can report metrics via wandb.log.
wandb.login()

### **`Loading and processing data`**

In [None]:
# Load the Fashion-MNIST dataset through Keras as (images, labels) pairs for the train and test splits.
(train_images, train_labels),(test_images, test_labels) = fashion_mnist.load_data()

In [None]:
# Hold out 10% of the training data for validation; random_state is fixed so the split is reproducible.
# NOTE: this rebinds train_images/train_labels, so re-running the cell alone splits the already-reduced set again.
train_images, val_images, train_labels, val_labels  = train_test_split(train_images,train_labels,test_size=0.1,random_state = 42)

In [None]:
# Flatten every image into a 1-D feature vector (one row per sample).
train_flat = train_images.reshape(train_images.shape[0], -1)

# Fit the normalisation statistics on the TRAINING split only: the per-feature mean
# and a single global scale (the largest centred training value).
train_mean = train_flat.mean(axis=0)
train_centered = train_flat - train_mean
train_scale = train_centered.max()
normalized = train_centered / train_scale

# Apply the *same* mean and scale to the validation and test splits so that all
# three splits live in the same feature space the network is trained on.
# NOTE: val_images / test_images are rebound here, so this cell is not idempotent;
# re-run the loading and splitting cells first if it has to be executed again.
val_images = (val_images.reshape(val_images.shape[0], -1) - train_mean) / train_scale
test_images = (test_images.reshape(test_images.shape[0], -1) - train_mean) / train_scale

### **Defining Neural Network**

In [None]:
class NeuralNet:
    """Fully connected feed-forward classifier trained with hand-written backpropagation.

    The network has ``Number_of_Layers`` affine layers; layer ``i`` has
    ``Number_of_Neuron_each_Layer[i]`` units and the last layer feeds a softmax.
    Samples are processed one at a time as 1-D vectors; mini-batch gradients are
    the SUM of the per-sample gradients.
    """

    # Lower bound applied to a probability before taking its log in the cross-entropy loss.
    _LOG_FLOOR = 1e-12

    def __init__(self,Size_of_Input, Number_of_Neuron_each_Layer, Number_of_Layers, activation_function, typeOfInit, L2reg_const = 0):
        """Store the architecture and draw the initial parameters.

        Size_of_Input: length of one input vector.
        Number_of_Neuron_each_Layer: units per layer, output layer last.
        Number_of_Layers: number of affine layers (hidden layers + output layer).
        activation_function: 'ReLU', 'tanh' or 'sigmoid' (hidden layers only).
        typeOfInit: 'random' or 'xavier'.
        L2reg_const: L2 weight-decay coefficient (0 disables it).
        """
        self.activation_function = activation_function
        self.Size_of_Input = Size_of_Input
        self.Number_of_Layers = Number_of_Layers
        self.Number_of_Neuron_each_Layer = Number_of_Neuron_each_Layer
        self.L2reg_const = L2reg_const
        self.W,self.b = self.initializer(typeOfInit)

    def initializer(self, init):
        """Return freshly drawn ``(W, b)`` lists for the 'random' or 'xavier' scheme.

        Raises ValueError for any other scheme name instead of returning empty lists.
        """
        if init not in ('random', 'xavier'):
            raise ValueError("Unknown weight initialization: %r" % (init,))

        sizes = self.Number_of_Neuron_each_Layer
        # fan_in[i] is the number of inputs feeding layer i.
        fan_in = [self.Size_of_Input] + list(sizes[:self.Number_of_Layers - 1])

        W = []
        for i in range(self.Number_of_Layers):
            shape = (sizes[i], fan_in[i])
            if init == 'random':
                W.append(np.random.randn(*shape))
            else:
                W.append(np.random.normal(0, math.sqrt(2/(sizes[i] + fan_in[i])), shape))

        # Biases are drawn uniformly from [0, 1) for both schemes.
        b = [np.random.rand(sizes[i]) for i in range(self.Number_of_Layers)]
        return W,b

    def activation(self, Z):
        """Apply the configured hidden-layer activation to ``Z``."""
        if self.activation_function == 'ReLU':
            return self.ReLU(Z)
        elif self.activation_function == 'tanh':
            return self.tanh(Z)
        elif self.activation_function == 'sigmoid':
            return self.sigmoid(Z)
        raise ValueError("Unknown activation function: %r" % (self.activation_function,))

    def activation_derivative(self,Z):
        """Return the element-wise derivative of the configured activation at ``Z``."""
        if self.activation_function == 'ReLU':
            return self.ReLU_derivative(Z)
        elif self.activation_function == 'tanh':
            return self.tanh_derivative(Z)
        elif self.activation_function == 'sigmoid':
            return self.sigmoid_derivative(Z)
        raise ValueError("Unknown activation function: %r" % (self.activation_function,))

    def ReLU(self,Z):
        """Element-wise max(0, Z)."""
        return np.maximum(0,Z)

    def ReLU_derivative(self,Z):
        """Element-wise ReLU derivative as a float array: 1 where Z > 0, else 0."""
        return (np.asarray(Z) > 0).astype(float)

    def tanh(self, Z):
        """Element-wise hyperbolic tangent (np.tanh does not overflow for large |Z|)."""
        return np.tanh(np.asarray(Z, dtype=float))

    def tanh_derivative(self, Z):
        """Element-wise derivative of tanh: 1 - tanh(Z)**2."""
        return 1 - self.tanh(Z)**2

    def sigmoid_derivative(self,Z):
        """Element-wise derivative of the logistic function."""
        activated = self.sigmoid(Z)
        return activated*(1-activated)

    def sigmoid(self,x):
        """Element-wise logistic function, evaluated without overflow.

        exp is only ever taken of a non-positive number, so neither branch of
        the np.where can overflow.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))
        return np.where(x>=0, 1/(1+decay), decay/(1+decay))

    def softmax_function(self,Z):
        """Softmax along axis 0, shifted by the maximum for numerical stability."""
        shifted = np.exp(Z - Z.max())
        return shifted/np.sum(shifted,axis=0)

    def forward_propagation(self,Input):
        """Run one sample through the network.

        Returns ``(A, H, y_hat)``: ``A`` holds the pre-activations of every layer,
        ``H`` the activations of the hidden layers (one fewer entry than ``A``),
        and ``y_hat`` is the softmax of the last pre-activation.
        """
        A = []
        H = []
        Input = np.array(Input)
        A.append(self.W[0].dot(Input) + self.b[0])
        for i in range(1, self.Number_of_Layers):
            H.append(self.activation(A[-1]))
            A.append(self.W[i].dot(H[-1]) + self.b[i])
        y_hat = self.softmax_function(A[-1])
        return A, H, y_hat

    def backward_propagation(self, A, H, y_hat, y, Input, loss_type):
        """Backpropagate one sample and return ``(delW, delb)``, one entry per layer.

        ``A``, ``H`` and ``y_hat`` are the values returned by forward_propagation
        for ``Input``; ``y`` is the integer class label. ``loss_type`` equal to
        'squared_error' selects the squared-error loss, anything else selects
        cross-entropy. The weight gradients include the ``L2reg_const * W`` term.
        The caller's ``H`` list is left untouched.
        """
        # layer_inputs[i] is the vector that feeds layer i.
        layer_inputs = [np.array(Input)] + list(H)

        ey = np.zeros(self.Number_of_Neuron_each_Layer[-1])
        ey[y] = 1
        diff = y_hat - ey

        if loss_type == 'squared_error':
            # Full softmax Jacobian (diag(y_hat) - y_hat y_hat^T) applied to (y_hat - ey).
            delta = y_hat*(diff - np.dot(y_hat, diff))
        else:
            delta = diff

        delW = [None]*self.Number_of_Layers
        delb = [None]*self.Number_of_Layers
        for i in range(self.Number_of_Layers-1,-1,-1):
            delW[i] = np.outer(delta, layer_inputs[i]) + self.L2reg_const*self.W[i]
            delb[i] = delta
            if i-1>=0:
                delta = np.multiply(self.W[i].T.dot(delta), self.activation_derivative(A[i-1]))
        return delW,delb

    def initialize(self, Size_of_Input,Number_of_Layers,Number_of_Neuron_each_Layer):
        """Return zero-filled ``(W, b)`` lists with the shapes of the given architecture."""
        fan_in = [Size_of_Input] + list(Number_of_Neuron_each_Layer[:Number_of_Layers - 1])
        W = [np.zeros((Number_of_Neuron_each_Layer[i], fan_in[i])) for i in range(Number_of_Layers)]
        b = [np.zeros(Number_of_Neuron_each_Layer[i]) for i in range(Number_of_Layers)]
        return W, b

    def optimize(self, X, Y, val_images,val_labels,optimizer, learning_rate, max_epochs,batch_size, loss_type):
        """Train with the optimizer named by ``optimizer``.

        Accepted names: 'sgd', 'momentum', 'nag', 'rmsprop', 'adam', 'nadam'.
        'sgd' ignores ``batch_size``. Raises ValueError for any other name
        instead of silently skipping training.
        """
        if optimizer == 'sgd':
            self.stochastic_gradient_descent(X, Y, val_images,val_labels, learning_rate, max_epochs, loss_type)
            return

        batched = {
            'momentum': self.momentum_gradient_descent,
            'nag': self.nesterov_accelerated_gradient_descent,
            'rmsprop': self.rmsprop,
            'adam': self.adam,
            'nadam': self.nadam,
        }
        if optimizer not in batched:
            raise ValueError("Unknown optimizer: %r" % (optimizer,))
        batched[optimizer](X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size, loss_type)

    def _zero_gradients(self):
        """Zero-filled accumulators shaped like this network's parameters."""
        return self.initialize(self.Size_of_Input,self.Number_of_Layers,self.Number_of_Neuron_each_Layer)

    def _l2_penalty(self):
        """L2 regularisation term matching the ``L2reg_const * W`` gradient."""
        if not self.L2reg_const:
            return 0.0
        return self.L2reg_const/2*sum(np.sum(w**2) for w in self.W)

    def _sample_loss(self, y_hat, label, loss_type):
        """Loss of one prediction (squared error or cross-entropy) plus the L2 penalty."""
        if loss_type == "squared_error":
            ey = np.zeros(self.Number_of_Neuron_each_Layer[-1])
            ey[label] = 1
            data_loss = 0.5*np.sum((ey-y_hat)**2)
        else:
            # Floor the probability so an exact 0 cannot raise a math domain error.
            data_loss = -math.log(max(y_hat[label], self._LOG_FLOOR))
        return data_loss + self._l2_penalty()

    def _log_metrics(self, metrics):
        """Send one epoch's metrics to Weights & Biases."""
        wandb.log(metrics)

    def _train(self, X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update, before_batch=None):
        """Shared epoch / mini-batch loop used by every optimizer.

        ``apply_update(delW, delb)`` receives the gradients summed over one
        mini-batch and must update ``self.W`` / ``self.b``. ``before_batch()``,
        when given, runs before the first sample of every mini-batch.
        One metrics dict is logged per epoch.
        """
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot train on an empty dataset")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        for j in range(max_epochs):
            correct = 0
            error = 0
            delW, delb = self._zero_gradients()
            pending = 0  # samples accumulated in the current mini-batch

            for i in range(n_samples):
                if pending == 0 and before_batch is not None:
                    before_batch()

                A,H,y_hat = self.forward_propagation(X[i])
                error += self._sample_loss(y_hat, Y[i], loss_type)
                if(np.argmax(y_hat) == Y[i]):
                    correct +=1

                w,b = self.backward_propagation(A,H,y_hat,Y[i],X[i], loss_type)
                for k in range(self.Number_of_Layers):
                    delW[k] += w[k]
                    delb[k] += b[k]
                pending += 1

                # Flush on a full mini-batch, and on the (possibly short) last one.
                if pending == batch_size or i == n_samples-1:
                    apply_update(delW, delb)
                    delW, delb = self._zero_gradients()
                    pending = 0

            error /= n_samples
            accuracy = correct/n_samples*100
            v_error, v_accruracy = self.val_loss_and_accuracy(val_images, val_labels, loss_type)
            self._log_metrics({'epoch' : j, 'train_loss' : error, 'train_accuracy' : accuracy,'valid_loss' : v_error,'valid_accuracy' : v_accruracy})

    def stochastic_gradient_descent(self,X, Y, val_images,val_labels, learning_rate, max_epochs, loss_type):
        """Plain SGD: one parameter update per training sample."""
        def apply_update(delW, delb):
            for k in range(self.Number_of_Layers):
                self.W[k] = self.W[k] - learning_rate*delW[k]
                self.b[k] = self.b[k] - learning_rate*delb[k]

        self._train(X, Y, val_images, val_labels, max_epochs, 1, loss_type, apply_update)

    def momentum_gradient_descent(self,X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size, loss_type, gamma = 0.6):
        """Mini-batch gradient descent with momentum coefficient ``gamma``."""
        updateW, updateb = self._zero_gradients()

        def apply_update(delW, delb):
            # The velocity is advanced exactly once per mini-batch.
            for k in range(self.Number_of_Layers):
                updateW[k] = gamma*updateW[k] + learning_rate*delW[k]
                updateb[k] = gamma*updateb[k] + learning_rate*delb[k]
                self.W[k] = self.W[k] - updateW[k]
                self.b[k] = self.b[k] - updateb[k]

        self._train(X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update)

    def nesterov_accelerated_gradient_descent(self, X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size,loss_type, gamma = 0.5):
        """Mini-batch Nesterov accelerated gradient descent.

        Gradients of each mini-batch are evaluated at the look-ahead point
        ``theta - gamma * update``; the step is then taken from ``theta`` itself.
        """
        updateW, updateb = self._zero_gradients()
        thetaW, thetab = list(self.W), list(self.b)

        def before_batch():
            for k in range(self.Number_of_Layers):
                thetaW[k] = self.W[k]
                thetab[k] = self.b[k]
                self.W[k] = thetaW[k] - gamma*updateW[k]
                self.b[k] = thetab[k] - gamma*updateb[k]

        def apply_update(delW, delb):
            for k in range(self.Number_of_Layers):
                updateW[k] = gamma*updateW[k] + learning_rate*delW[k]
                updateb[k] = gamma*updateb[k] + learning_rate*delb[k]
                # Step from the saved parameters, not from the look-ahead point.
                self.W[k] = thetaW[k] - updateW[k]
                self.b[k] = thetab[k] - updateb[k]

        self._train(X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update, before_batch)

    def rmsprop(self,X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size,loss_type, beta = 0.89, epsilon = 1e-6):
        """Mini-batch RMSprop with decay ``beta`` and stabiliser ``epsilon``."""
        v_W, v_b = self._zero_gradients()

        def apply_update(delW, delb):
            for k in range(self.Number_of_Layers):
                v_W[k] = beta*v_W[k] + (1-beta)*delW[k]**2
                v_b[k] = beta*v_b[k] + (1-beta)*delb[k]**2
                self.W[k] = self.W[k] - (learning_rate*delW[k])/np.sqrt(v_W[k] + epsilon)
                self.b[k] = self.b[k] - (learning_rate*delb[k])/np.sqrt(v_b[k] + epsilon)

        self._train(X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update)

    def adam(self,X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size, loss_type, beta1 = 0.89,beta2 = 0.989,epsilon = 1e-8):
        """Mini-batch Adam with bias correction ``1 - beta**t`` (t = update count, from 1)."""
        m_W, m_b = self._zero_gradients()
        v_W, v_b = self._zero_gradients()
        step = 0

        def apply_update(delW, delb):
            nonlocal step
            step += 1
            m_fix = 1 - math.pow(beta1, step)
            v_fix = 1 - math.pow(beta2, step)
            for k in range(self.Number_of_Layers):
                m_W[k] = beta1*m_W[k] + (1-beta1)*delW[k]
                m_b[k] = beta1*m_b[k] + (1-beta1)*delb[k]
                v_W[k] = beta2*v_W[k] + (1-beta2)*delW[k]*delW[k]
                v_b[k] = beta2*v_b[k] + (1-beta2)*delb[k]*delb[k]
                self.W[k] = self.W[k] - (learning_rate*(m_W[k]/m_fix))/np.sqrt(v_W[k]/v_fix + epsilon)
                self.b[k] = self.b[k] - (learning_rate*(m_b[k]/m_fix))/np.sqrt(v_b[k]/v_fix + epsilon)

        self._train(X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update)

    def nadam(self, X, Y, val_images,val_labels, learning_rate, max_epochs,batch_size, loss_type, beta1 = 0.89,beta2 = 0.989,epsilon = 1e-8):
        """Mini-batch Nadam: Adam whose step mixes the corrected momentum with the current gradient."""
        m_W, m_b = self._zero_gradients()
        v_W, v_b = self._zero_gradients()
        step = 0

        def apply_update(delW, delb):
            nonlocal step
            step += 1
            m_fix = 1 - math.pow(beta1, step)
            v_fix = 1 - math.pow(beta2, step)
            for k in range(self.Number_of_Layers):
                m_W[k] = beta1*m_W[k] + (1-beta1)*delW[k]
                m_b[k] = beta1*m_b[k] + (1-beta1)*delb[k]
                v_W[k] = beta2*v_W[k] + (1-beta2)*delW[k]**2
                v_b[k] = beta2*v_b[k] + (1-beta2)*delb[k]**2
                nesterov_W = beta1*(m_W[k]/m_fix) + (1-beta1)*delW[k]/m_fix
                nesterov_b = beta1*(m_b[k]/m_fix) + (1-beta1)*delb[k]/m_fix
                self.W[k] = self.W[k] - (learning_rate*nesterov_W)/np.sqrt(v_W[k]/v_fix + epsilon)
                self.b[k] = self.b[k] - (learning_rate*nesterov_b)/np.sqrt(v_b[k]/v_fix + epsilon)

        self._train(X, Y, val_images, val_labels, max_epochs, batch_size, loss_type, apply_update)

    def val_loss_and_accuracy(self,val_data,val_labels, loss_type):
        """Return ``(mean loss, accuracy in percent)`` of the current weights on ``val_data``."""
        n_samples = val_data.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot evaluate on an empty dataset")

        val_correct = 0
        val_error = 0
        for i in range(n_samples):
            A,H,y_hat = self.forward_propagation(val_data[i])
            val_error += self._sample_loss(y_hat, val_labels[i], loss_type)
            if np.argmax(y_hat) == val_labels[i]:
                val_correct += 1
        return val_error/n_samples, val_correct/n_samples*100

    def test(self,test_data,test_labels):
        """Return ``(predicted class per sample, accuracy in percent)`` on ``test_data``."""
        n_samples = test_data.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot evaluate on an empty dataset")

        correct = 0
        y_hat = []
        for i in range(n_samples):
            A,H,y = self.forward_propagation(test_data[i])
            # Compare the CURRENT prediction, not the list of predictions collected so far.
            if np.argmax(y) == test_labels[i]:
                correct += 1
            y_hat.append(y)
        return np.argmax(np.array(y_hat),axis=1), correct/n_samples*100

### **Sweep (Hyperparameter tuning)**

In [None]:
# Grid sweep definition for Weights & Biases: the search maximises 'valid_accuracy'.
# Entries with 'value' are held fixed; entries with 'values' are swept over.
sweep_config = {
    'method': 'grid',
    'metric': {
        'goal': 'maximize',
        'name': 'valid_accuracy',
    },
    'parameters': {
        # fixed settings
        'activation_function': {'value': 'tanh'},
        'epochs': {'value': 10},
        'hidden_layer_size': {'value': 32},
        'num_of_hidden_layers': {'value': 1},
        'weight_decay': {'value': 0},
        'weight_initialization': {'value': 'xavier'},
        # swept settings
        'batch_size': {'values': [32, 64]},
        'learning_rate': {'values': [0.005, 0.0006]},
        'optimizer': {'values': ['momentum', 'adam']},
        'loss_type': {'values': ['squared_error', 'cross_entropy']},
    },
}

In [None]:
def train():
    """Run one sweep trial: build a NeuralNet from the W&B run config and train it.

    Reads the hyperparameters from ``wandb.init().config`` and trains on the
    notebook-level ``normalized`` / ``train_labels`` arrays, validating on
    ``val_images`` / ``val_labels``.
    """
    run = wandb.init()
    cfg = run.config

    hidden_count = cfg.num_of_hidden_layers
    # Hidden layers of equal width followed by the 10-unit output layer.
    layer_sizes = [cfg.hidden_layer_size] * hidden_count + [10]

    net = NeuralNet(
        normalized.shape[1],
        layer_sizes,
        hidden_count + 1,
        cfg.activation_function,
        cfg.weight_initialization,
        cfg.weight_decay,
    )
    net.optimize(
        normalized,
        train_labels,
        val_images,
        val_labels,
        cfg.optimizer,
        cfg.learning_rate,
        cfg.epochs,
        cfg.batch_size,
        cfg.loss_type,
    )

In [None]:
# Register the sweep defined by sweep_config under the given W&B project and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project='CS6910 assignment1')

In [None]:
# Start a W&B agent for the sweep, using train as the function it runs.
wandb.agent(sweep_id, train)