# Biologically Inspired Computation (F20BC/F21BC), Coursework I 

## Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import math
import time

np.random.seed(3) # seed is set so that the random values give similar values at each run.

## Getting a Glimpse of the Data

In [2]:
df = pd.read_csv("data.csv") # reading the data from the csv file

In [3]:
df.columns # to see the columns in the dataframe

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [4]:
df.head() # to see the first five rows/instances/samples

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
df.info() # to see detailed information about the dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [6]:
df.duplicated().sum() # to check for duplicate rows

0

In [7]:
df.isna().sum() # to see the number of null values in each of the columns

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

#### From the above output, we can infer that there are no duplicates and null values in the data (except for the Unnamed column)

In [8]:
df['diagnosis'].value_counts() # to see the different values of the 'diagnosis' column

B    357
M    212
Name: diagnosis, dtype: int64

## Data Pre-processing

In [9]:
# droping the unnamed and id features in the same dataframe
df.drop(['Unnamed: 32', 'id'], axis=1, inplace=True)
X = df.drop('diagnosis', axis=1) # dropping the class/target attribute
y = df.diagnosis # y set is the diagnosis column (class/target attribute)

In [10]:
y = np.array(y.map(lambda x: 1 if x=='M' else 0)) # encoding all Ms to 1s and Bs to 0s
y = y.reshape(569,1) # to reshape y into a size compatible with the X set (number of rows=569, number of features=1)

In [11]:
# Splitting the X and y datasets into train and test sets for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# normalizing the X training and testing sets with min-max where the minimum value of a feature is made a 0 and the maximum value is made a 1, all other value become floats betwween 0 and 1
X_train = (X_train - X_train.min())/(X_train.max() - X_train.min()) 
X_test = (X_test - X_test.min())/(X_test.max() - X_test.min())

### Dimensions of the final X and y training and testing datasets
The output of .shape gives the dimensions in the following order- (number of rows, number of columns)

In [13]:
X.shape

(569, 30)

In [14]:
y.shape

(569, 1)

In [15]:
X_train.shape

(426, 30)

In [16]:
X_test.shape

(143, 30)

In [17]:
y_train.shape

(426, 1)

In [18]:
y_test.shape

(143, 1)

## Building the Neural Network

### Initializing the values of the hyperparamters

In [19]:
n_inputs = X.shape[1] # the number of features
n_hidden_layers = 2 # default number of hidden layers
n_hidden = [] # list to contain the number of nodes in each hidden layer
activations = [] # list to contain the names of the activation functions selected for each layer
lr = 0.1 # learning rate 
decay_rate = 0 # decay rate of the learning rate
lr_decay = 'n' # if there should be a learning rate decay
loss = 'c' # default loss type as cross entropy loss
gradient_alg = 'b' # default gradient desecent algorithm as batch gradient descent
n_outputs = 1 # number of output nodes
dimensions = [] # to store the number of nodes in the input and all the layers

### Inputting the values of the hyperparamters

In [None]:
while True: # to loop until a valid number is given
    print("Please enter the number of input nodes/features (default = ",X.shape[1],")")
    n_inputs = int(input())
    if n_inputs <= X_train.shape[1]:
        break
    else:
        print("Try a smaller number less than or equal to " + X_train.shape[1])

# Taking the number of input features as requested
X_train = X_train.iloc[:,:n_inputs]
X_test = X_test.iloc[:,:n_inputs]

print("Please enter the number of hidden layers")
n_hidden_layers = int(input())

for i in range(n_hidden_layers):
    print("For layer " + str(i + 1) + ":")
    print(
        "Please enter the number of hidden neurons in this layer (same as the size of this hidden layer) "
    )
    n_hidden.append(int(input()))

for i in range(n_hidden_layers + 1):
    print("For layer " + str(i + 1) + ":")
    valid1 = False
    while (valid1 == False): # to loop until a valid function is given
        print(
            "Please choose an activation function (type one of- sigmoid/relu/tanh/softmax)"
        )
        inp_a = input()
        if (inp_a == "sigmoid" or inp_a == "relu" or inp_a == "tanh"):
            valid1 = True
            activations.append(inp_a)
            
        elif (inp_a == "softmax"):
            valid1 = True
            activations.append(inp_a)
            n_outputs = 2

        else:
            print("Try Again")

print("Please enter the number of epochs(1-20000)")
epochs = int(input())

while True: # to loop until a valid number is given
    print("Please enter a learning rate <= 1")
    lr = float(input())
    if lr <= 1:
        break

print("Should the learning rate decay with each layer? (type y/n)")
lr_decay = input()

valid2 = False
while (valid2 == False): # to loop until a valid function is given
    print(
        "Please choose a loss function (cross entropy loss (type c)/ hinge loss (type h)/ squared hinge loss(type s))"
    )
    loss = input()

    if (loss == "c" or loss == "h" or loss == "s"):
        valid2 = True

    else:
        print("Try Again")

valid3 = False
while (valid3 == False): # to loop until a valid algorithm is given
    print(
        "Please choose a gradient descent algorithm (batch gradient descent (type b)/ stochastic gradient descent (type s)/ and mini-batch gradient descent (type m))"
    )
    gradient_alg = input()

    # for mini batch gradient descent
    if (gradient_alg == "m"):
        valid3 = True
        print("Please enter the number of batches")
        batch_size = int(input())
    
    # for batch gradient descent
    elif (gradient_alg == "b"): # batch size of a mini batch would be the same as the number of data samples/rows
        valid3 = True
        batch_size = X.shape[0] #X_train

    # for stochatic gradient descent
    elif (gradient_alg == "s"): # batch size of a mini batch would be the 1 as costs are cmomputed for each of the samples
        valid3 = True
        batch_size = 1
        
    else:
        print("Try Again")
        

Please enter the number of input nodes/features (default =  30 )
30
Please enter the number of hidden layers
3
For layer 1:
Please enter the number of hidden neurons in this layer (same as the size of this hidden layer) 
21
For layer 2:
Please enter the number of hidden neurons in this layer (same as the size of this hidden layer) 


In [None]:
# to append the number of input festures into the dimensions list
dimensions.append(n_inputs)
# to add the number of nodes in each of the hidden layers from n_hidden into the dimensions list
dimensions.extend(n_hidden)
# to append the number of output nodes into the dimensions list
dimensions.append(n_outputs)

# to convert all elements in dimensions to numpy ints
dimensions = np.array(dimensions, dtype=np.int64)

### Intializing Parameters

In [None]:
def initialize_parameters(dimensions):
    
    """
    To initialize all the weights and biases
    
        Parameters:
            dimensions - list with the input features and number of nodes in all the layers of the network

        Returns:
            parameters - dictionary with the values of the weights and biases where 'W' is the prefix for weights and 'b' for the bias
    """
    
    parameters = {}
    for i in range (len(dimensions)-1):
        parameters["W"+str(i+1)] = np.random.randn(dimensions[i+1],dimensions[i]) / np.sqrt(dimensions[i]) # giving random values with the specified dimensions
        parameters["b"+str(i+1)] = np.zeros((dimensions[i+1],1)) # setting the bias to zero vector with the specified dimensions
    
    return parameters

### Forward Propagation

In [None]:
# Sigmoid function
def sigmoid(Z):
    """
    To compute the value of the activation or the output of the layer with sigmoid activation function.
    
        Parameters:
            Z - linear activation value- 

        Returns:
            
            A - node output
    """
    A = 1 / (1 + np.exp(-Z))
    return A

# ReLU (rectified linear unit)
def relu(Z):
    """
    To compute the value of the activation or the output of the layer with ReLU activation function.
    
        Parameters:
            Z - linear activation value- 

        Returns:
            
            A - node output
    """
    A = np.maximum(0, Z)
    return A

# Hyperbolic tangent
def tanh(Z):
    """
    To compute the value of the activation or the output of the layer with the tanh function.
    
        Parameters:
            Z - linear activation value- 

        Returns:
            
            A - node output
    """
    A = np.tanh(Z)
    return A

# Softmax function
def softmax(Z):
    """
    To compute the value of the activation or the output of the layer with softmax funxtion.
    
        Parameters:
            Z - linear activation value- 

        Returns:
            
            A - node output
    """
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0)
    return A

In [None]:
def z_comput(W, b, prev_A):
    
    """
    To compute the value of the linear activation
    
        Parameters:
            W - weight matrix of the layer
            b - bias matrix of the layer
            prev_A - previous activation values from the previous layer

        Returns:
            Z - linear activation value
            cache - tuple with the previous activations, weight matrix, and bias matrix
    """
    
    Z = np.dot(W,prev_A)+b # calculating the value of the linear activation
    lcache = (prev_A, W, b) # storing the values involved in this calculation in a tuple- linear cache

    return Z, lcache

In [None]:
def forward_prop(W, b, prev_A, activation):
    
    """
    To compute the value of the activation/ layer output
    
        Parameters:
            W - weight matrix of the layer
            b - bias matrix of the layer
            prev_A - previous activation values from the previous layer
            activation - activation function for the given layer

        Returns:
            A - node output
            cache - tuple with the previous activations, weight matrix, bias matrix, and linear activation (Z)
    """
    
    A = np.zeros((W.shape[0], prev_A.shape[1])) # initializing A
    cache = ()

    # computing the value of the output based on the activation function     
    if activation == "relu":
        Z, lcache = z_comput(W, b, prev_A) # obtaining the value of Z and weight, bias, previous activations
        A = relu(Z)
        zcache = Z
        cache = (lcache, zcache) # combined caches with previous activations, weights, bias, Z- linear activation value

    elif activation == 'sigmoid':
        Z, lcache = z_comput(W, b, prev_A)
        A = sigmoid(Z)
        zcache = Z
        cache = (lcache, zcache)

    elif activation == 'tanh':
        Z, lcache = z_comput(W, b, prev_A)
        A = tanh(Z)
        zcache = Z
        cache = (lcache, zcache)

    elif activation == 'softmax':
        Z, lcache = z_comput(W, b, prev_A)
        A = softmax(Z)
        zcache = Z
        cache = (lcache, zcache) 

    return A, cache

In [None]:
def forward_prop_layers(X, parameters, activations):
    """
    To compute the value of the activation for all the layers
    
        Parameters:
            X - X dataset without class attribute
            parameters - weight matrix and bias matrix of all the layers
            activations - names of the activation functions of all the layers

        Returns:
            A - activation value of all layers- of the forward pass
            cache - tuple with the previous activations, weight matrices, bias matrices and linear activations (Z)
    """
    A = X # the previous activation matrix is initialized with the X matrix for the calculation of the next activation value
    caches = []
    l = len(parameters)//2 # number of layers is half the size of the parameters dictionary (= number of weight/bias matrices)
    for i in range (l): # computing the activations for all layers and compiling all the caches
        prev_A = A # represents the previous activation value computed
        A, cache = forward_prop(parameters["W"+str(i+1)],parameters["b"+str(i+1)], prev_A, activations[i])
        caches.append(cache)
        
    return A, caches

### Computing the cost/loss

In [None]:
def compute_cost(A, Y, loss):
    """
    To compute the cost/loss
    
        Parameters:
            Y - actual values of the target class attribute
            A - activation values or predicted values
            loss- loss function to use

        Returns:
            cost1 - cost calculated using the specified loss function
    """
    n = Y.shape[1] # number of samples
    if loss == "c": # for cross entropy loss
        cost = (1./n) * (-np.dot(Y,np.log(A+1e-15).T) - np.dot(1-Y, np.log(1-A+1e-15).T))
    elif loss == "h": # for hinge loss
        cost = np.mean(np.maximum(0, 1 - Y * A))
    elif loss == "s": # for squared hinge loss
        cost = np.mean(np.maximum(0, 1 - Y * A))**2

    cost1 = np.squeeze(cost) # removing one-dimensional elements or extracting the cost value from the numpy array
    return cost1

### Backward Propagation

In [None]:
def derivative_compute(dZ, lcache):
    """
    To compute the derivatives/gradients of the weights, biases, and activation value
    
        Parameters:
            dZ - gradient of cost with respect to Z (linear activation)
            lcache - linear cache with the values of weights, biases and activations

        Returns:
            dprev_A - previous activation value
            dW - derivative of the weights of that layer
            db - derivate of the weights of that layer
    """
    
    prev_A, W, b = lcache # extracting each of the values in the lcache
    n = prev_A.shape[1] # number of samples
    
    # averaging the values over the nodes of the layers
    dW = 1./n * np.dot(dZ, prev_A.T)
    db = 1./n * np.sum(dZ, axis = 1, keepdims = True)
    
    dprev_A = np.dot(W.T, dZ)
    
    return dprev_A, dW, db

In [None]:
def backward_prop(dA, activation, cache):
    """
    To compute the value of derivatives for a layer
    
        Parameters:
            dA - derivative of the activation value
            activation- activation function to use
            cache - weights, biases, alinear activations and layer outputs

        Returns:
            dprev_A - previous activation value
            dW - derivative of the weights of that layer
            db - derivate of the weights of that layer
    """
    
    lcache, zcache = cache
    Z = zcache # the value of Z from zcache
    
    if activation == "relu":
        dZ = np.array(dA, copy=True) # dz is made into a numpy array for computations
        dZ[Z <= 0] = 0 # dz is set to 0 if Z is greater than or equal to 0 
        dprev_A, dW, db = derivative_compute(dZ, lcache) # to get the derivatives using the derivative_compute function
        
    elif activation == "sigmoid":
        s = 1/(1+np.exp(-Z))
        dZ = dA * s * (1-s)
        dprev_A, dW, db = derivative_compute(dZ, lcache)
        
    elif activation == "tanh":
        dZ = np.multiply(dA,
                         (1- ((np.tanh(Z))*(np.tanh(Z)))))
        dprev_A, dW, db = derivative_compute(dZ, lcache)
        
    elif activation == "softmax":
        dZ = np.multiply(dA, np.exp(Z) / sum(np.exp(Z)) * (1. - np.exp(Z) / sum(np.exp(Z))))
        dprev_A, dW, db = derivative_compute(dZ, lcache)

    return dprev_A, dW, db


In [None]:
def backward_prop_layers(last_A, Y, caches, activations):
    """
    To compute the value of derivatives for a layer
    
        Parameters:
            dA - gradient of the activation value
            activation- activation function to use
            cache - weights, biases, alinear activations and layer outputs

        Returns:
            cost1 - cost calculated using the specified loss function
    """

    gradient_dict = {} # dictionary to store the gradients
    l = len(caches) # number of layers = length of the caches
    
    Y = Y.reshape(last_A.shape) # reshping or chanding the dimensions of Y for compatibility for computations
    
    current_cache = caches[l - 1] # the last cache is chosen first to proceed backwards from the alast layer
    
    # calculating the first activation gradient
    gradient_dict["dA" + str(l)] = -(np.divide(Y, last_A+1e-15) -
                                     np.divide(1 - Y, 1 - last_A+1e-15))
    
    # the first set of gradients for Aactivations, weights and biases are calculated with the computed derivative of A
    gradient_dict["dA" + str(l - 1)], gradient_dict[
        "dW" + str(l)], gradient_dict["db" + str(l)] = backward_prop(
            gradient_dict["dA" + str(l)], activations[l - 1], current_cache)

    # calculating the gradients of weights, biases and activation values for all layers, starting from the second last layer in this loop
    for i in reversed(range(l - 1)):
        current_cache = caches[i]
        dprev_A, dW, db = backward_prop(gradient_dict["dA" + str(i + 1)],
                                        activations[i], current_cache)
        gradient_dict["dA" + str(i)] = dprev_A
        gradient_dict["dW" + str(i + 1)] = dW
        gradient_dict["db" + str(i + 1)] = db

    return gradient_dict

### Updating Parameters

In [None]:
def update_parameters(parameters, grad_dict, lr):
    """
    To update the parameters (weights and biases)
    
        Parameters:
            parameters - dictionary with weights and biases to update
            grad_dict - dictionary with gradients of weights, biases and activations
            lr- learning rate

        Returns:
            parameters - updated parameters dictionary
    """
    
    l = len(parameters) // 2 # number of layers in the model/network
    
    # for al layers, parameter = original parameter value - learning rate * gradient of the parameter calculation is done
    for i in range(l): 
        parameters["W" + str(i+1)] = parameters["W" + str(i+1)] - lr * grad_dict["dW"+str(i+1)]        
        parameters["b" + str(i+1)] = parameters["b" + str(i+1)] - lr * grad_dict["db"+str(i+1)]

    return parameters

### Generating Mini Batches

In [None]:
def random_mini_batches(X, y, batch_size):
    """
    To create mini batches randomly
    
        Parameters:
        X - input data without the target attribute
        y - target attribute with actual values
        batch_size - size of the mini-batches

        Returns:
        mini_batches - batches of the X and y sets
    """
    

    n = X.shape[1] # number of data samples
    mini_batches = []
    
    # shuffling the data to avoid having the same data together each time
    shuffle = list(np.random.permutation(n))
    X_shuffle = X.iloc[:, shuffle]
    y_shuffle = y[:, shuffle].reshape((1,n))

    num_complete_minibatches = n//batch_size # number of batches to make from the batch size
    
    # loop for making the batches as many as the number of batches calculated
    for i in range(num_complete_minibatches):
        
        mini_batch_X = X_shuffle.iloc[:,i*batch_size:(i+1)*batch_size]
        mini_batch_y = y_shuffle[:,i*batch_size:(i+1)*batch_size]
     
        mini_batch = (mini_batch_X, mini_batch_y)
        mini_batches.append(mini_batch)
    
    # when the number of data samples is lesser than the batch size considered, the last batch will have a size lesser than the batch size considered
    if n % batch_size != 0:
        
        mini_batch_X = X_shuffle.iloc[:,num_complete_minibatches*batch_size:n]
        mini_batch_y = y_shuffle[:,num_complete_minibatches*batch_size:n]
        
        mini_batch = (mini_batch_X, mini_batch_y)
        mini_batches.append(mini_batch)
    
    return mini_batches

### The Model

In [None]:
def ann(X, y, dimensions, lr, lr_decay, batch_size, epochs, loss, activations, gradient_alg):
    """
    Represents the artificial neural network with all components compiled together
    
        Parameters:
            X - input data without the class attribute
            y - actual values of the class attribute
            lr - learning rate
            lr_decay - whether to have a learning rate decay/schedule or not
            batch_size - size of the batches of data to consider
            epochs - number of iterations the computations are to be done for
            loss - loss function to use for the error calculation
            activations - list of the activation functions to use for each layer
            gradient_alg - gradient descent algorithm to use

        Returns:
            parameters1 - list of parameters, dimensions, activations, costs, lr, and batch_size
    """
    start = time.time()
    
    n = X.shape[1] # number of samples
    costs = [] # list of costs in each epoch
    
    # for learning rate decay
    alpha_zero = 0.2 
    bool_decay = False # there is no decay enabled by default 
    if(lr_decay == 'y'):
        bool_decay = True
        lr_decay_rate =  0.01 # decay rate
    
    parameters = initialize_parameters(dimensions) # initial values for the weights and biases
    
    if (gradient_alg == "b"): # the batch size is set as the number of samples, for batch gradient descent
        batch_size = X.shape[1]
    
    for i in range(epochs):
        
        # forming mini batches with the given batch size
        minibatches = random_mini_batches(X, y, batch_size)
        cost_total = 0
        
        # iterating over all the minibatches
        for minibatch in minibatches:
            
            (minibatch_X,minibatch_Y) = minibatch # to get the minibatch
            
            # forward propgation/pass is done and it outputs the activations and caches
            last_A, caches = forward_prop_layers(minibatch_X, parameters, activations) 
            
            # error calculation using the loss function
            cost_total += compute_cost(last_A, minibatch_Y, loss) 
            
            # backward propogation/pass outputs the gradients
            gradients = backward_prop_layers(last_A, minibatch_Y, caches, activations)
            
            # weights and biases are updated using the learning rate and gradients from backward pass
            parameters = update_parameters(parameters, gradients, lr)
            
            #If user has configured 'y' for learning decay, a learning rate schedule is implemeted
            if(bool_decay):
                lr = math.pow(0.5, math.floor((1+i)/10.0))  #Step decay 
                  
            else:
                bool_decay = False
                lr = lr

        # updating the cost by taking its total mean from the costs calculated in the different batches wherever applicable
        if gradient_alg == 'b':
            cost_avg = cost_total
        else: 
            cost_avg = cost_total / n 
            
        # printing the cost for every 10 epochs
        if i %10 == 0:
            print ("Cost/error after epoch %i: %f" %(i, cost_avg))
        costs.append(cost_avg)
          
    end = time.time()
    total_time = end-start # to calculate the total time taken for training the model
    
    print ("Cost/error after all epochs: ",(cost_avg))

    plt.plot(costs)
    plt.ylabel('Computed cost')
    plt.xlabel('Number of epochs')
    plt.title("Learning rate = " + str(lr))
    plt.show()

    # all values to return for later use in visualizations and predictions
    parameters1 = [parameters, dimensions, activations, costs, lr, batch_size, total_time]
    
    return parameters1

### Inference Function

In [None]:
def predict(X, y, parameters, activations):
    """
    To predict the labels after running the forward pass
    
        Arguments:
        X - input data without the class attribute
        y - actual values of the class attribute
        parameters - parameters (weights and biases) of the model after training
        activations - list of the activation functions to use for each layer

        Returns:
        pred -- predictions for the given input data- X
    """
    
    n = X.shape[1] # number of samples
    pred = np.zeros((1,n)) # initializing the predictions
    
    # Forward propagation with the given values
    A, caches = forward_prop_layers(X,parameters,activations)
    
    # for every prediction greater than 0.39, make it a 1 else a 0
    for i in range(0, A.shape[1]):
        if A[0,i] >= 0.39:
            pred[0,i] = 1
        else:
            pred[0,i] = 0
    
    # accuracy is computed by dividing the sum of the number of correct predictions by the total number of samples
    accuracy = np.sum((pred == y)/n)
    
    print("Accuracy: "  + str(accuracy))
        
    return pred, accuracy
    

### Training and Testing with inputted Parameters

In [None]:
parameters = ann(X_train.T, y_train.T, dimensions, lr, lr_decay, batch_size, epochs, loss, activations, gradient_alg)
params = parameters[0]
activations1 = parameters[2]

# Accuracy is displayed in the range of 0-1 where 1 is 100%.
print('Training Accuracy')
pred_train, acc = predict(X_train.T, y_train.T, params, activations1)
print('Testing Accuracy')
pred_test, acc = predict(X_test.T, y_test.T, params, activations1)

### Testing the model

In [None]:
# Some of the tested combinations:- traning function call has been followed tby the classification accuracy percentage

# parameters = ann(X_train.T, y_train.T, [30,8,8,8,1], 0.01, "n", 426, 800, "c", ["relu","relu","relu","sigmoid"], "b") #90.9%
# parameters = ann(X_train.T, y_train.T, [30,8,8,8,1], 0.01, "n", 426, 800, "h", ["relu","relu","relu","sigmoid"], "b") #88.9%
# parameters = ann(X_train.T, y_train.T, [30,21,25,59,1], 0.01, "n", 426, 1000, "c", ["relu","relu","relu","sigmoid"], "b") #92.3%
# parameters = ann(X_train.T, y_train.T, [30,20,25,21,59,1], 0.01, "n", 426, 1500, "c", ["relu","relu","relu","relu","sigmoid"], "b") #94.4%
# parameters = ann(X_train.T, y_train.T, [30,20,20,20,20,1], 0.00095, "y", 16, 1000, "c", ["relu","relu","relu","relu","sigmoid"], "m")  #94%
# parameters = ann(X_train.T, y_train.T, [30,20,21,20,25,1], 0.00095, "n", 16, 1500, "c", ["relu","relu","relu","relu","sigmoid"], "m") #92%
# parameters = ann(X_train.T, y_train.T, [30,20,20,20,1], 0.00095, "y", 16, 1500, "c", ["relu","relu","relu","relu","sigmoid"], "m")  #94.4%
# parameters = ann(X_train.T, y_train.T, [30,25,25,21,20,1], 0.01, "n", 1, 150, "c", ["relu","relu","relu","relu","sigmoid"], "s") #87.4%
# parameters = ann(X_train.T, y_train.T, [30,10,20,25,1], 0.01, "n", 1, 200, "c", ["relu","relu","relu","sigmoid"], "s") # 88%
# parameters = ann(X_train.T, y_train.T, [30,10,20,25,1], 0.01, "n", 426, 4500, "c", ["relu","relu","relu","sigmoid"], "b") #93%
# parameters = ann(X_train.T, y_train.T, [30,200,200,200,200,1], 0.00095, "y", 16, 800, "h", ["relu","relu","relu","relu","sigmoid"], "m") #93%
# parameters = ann(X_train.T, y_train.T, [30,21,25,30,1], 0.01, "n", 426, 1000, "c", ["relu","relu","relu","sigmoid"], "b") #90%
# parameters = ann(X_train.T, y_train.T, [30,21,25,30,1], 0.01, "y", 426, 500, "c", ["relu","relu","relu","sigmoid"], "s") #92.3%
# parameters = ann(X_train.T, y_train.T, [30,10,20,25,1], 0.01, "n", 426, 4500, "c", ["relu","relu","relu","sigmoid"], "b") # 92%
# parameters = ann(X_train.T, y_train.T, [30,20,21,25,1], 0.01, "n", 426, 150, "c", ["relu","relu","relu","sigmoid"], "s")# 90.2%

parameters = ann(X_train.T, y_train.T, [30,10,8,8,1], 0.03, "n", 426, 1000, "c", ["relu","tanh","relu","sigmoid"], "b")
params = parameters[0]
activations1 = parameters[2]
pred, acc = predict(X_test.T, y_test.T, params, activations1)

print("\n")

# Confusion matrix where tn-true negative, fp- false positive, fn- false negative, tp- true positive
tn, fp, fn, tp = confusion_matrix(y_test.reshape(1,y_test.shape[0])[0],pred[0]).ravel()
print(tn,fp,fn,tp)

# Visualizations

In [None]:
# Cost values charted after increasing epochs and decreasing learning rates simultaneously
lrs = [1e-1,1e-2,2e-2,3e-2,5e-2,6e-2,7e-2,8e-2,9e-2,1e-3,2e-3,3e-3,5e-3,6e-3,7e-3,8e-3,9e-3]
epoch = [50,60,80,100,140,200,250,300,500,1000,1500,2000]

for epochs in epoch:
    for lr1 in lrs:
        parameters = ann(X_train.T, y_train.T, [30,10,8,8,1], lr1, "n", 426, epochs, "c", ["relu","tanh","relu","sigmoid"], "b")
        print("Accuracy")
        predictions, acc = predict(X_test.T, y_test.T, parameters[0], parameters[2])


In [None]:
# Reporting Accuracy v/s Learning Rates, and Training time v/s Learning Rates using the follwing hyperparameters
lrs = [1e-1,1e-2,2e-2,3e-2,5e-2,6e-2,7e-2,8e-2,9e-2,1e-3,2e-3,3e-3,5e-3,6e-3,7e-3,8e-3,9e-3]
accs = []
times  = []
for lr1 in lrs:
    parameters = ann(X_train.T, y_train.T, [30,21,25,30,1], lr1, "y", 426, 500, "c", ["relu","relu","relu","sigmoid"], "b")
    print("Accuracy")
    predictions, acc = predict(X_test.T, y_test.T, parameters[0], parameters[2])
    accs.append(acc)
    times.append(parameters[6])
        
plt.scatter(lrs, accs)
plt.ylabel('Accuracy')
plt.xlabel('lr')
plt.title("Accuracy v/s Learning Rates")
plt.show()

plt.scatter(lrs, times)
plt.ylabel('Traning Time in seconds')
plt.xlabel('lr')
plt.title("Training time v/s Learning Rates")
plt.show()


In [None]:
# Reporting Accuracy v/s Number of Epochs, and Training time v/s Number of Epochs using the follwing hyperparameters
epoch = [50,60,80,100,140,200,250,300,500,1000,1500,2000]
accs = []
times  = []
for epochs in epoch:
    parameters = ann(X_train.T, y_train.T, [30,21,25,30,1], lr1, "y", 426, 500, "c", ["relu","relu","relu","sigmoid"], "b")
    print("Accuracy")
    predictions = predict(X_test.T, y_test.T, parameters[0], parameters[2])
    accs.append(acc)
    times.append(parameters[6])
        
plt.scatter(epoch, accs)
plt.ylabel('Accuracy')
plt.xlabel('Number of Epochs')
plt.title("Accuracy v/s Number of Epochs")
plt.show()

plt.scatter(epoch, times)
plt.ylabel('Training time in seconds')
plt.xlabel('Number of Epochs')
plt.title("Training time v/s Number of Epochs")
plt.show()