## Binary Classification Neural Net Sigmoid Batch Learning

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math 
from scipy.stats import truncnorm
import time

In [2]:
#loading and prepare data

train_data=np.loadtxt("data/assignment/overhead_mnist_train.csv",delimiter=",")
test_data=np.loadtxt("data/assignment/overhead_mnist_test.csv",delimiter=",")

In [3]:
# Extracting the feature vectors from the training data.
X_train=train_data[:,1:]

# Extracting the labels from the training data.
y_train=train_data[:,0]

# Extracting the feature vectors from the testing data.
X_test=test_data[:,1:]

# Extracting the labels from the testing data.
y_test=test_data[:,0]

In [4]:
# Checking the shapes of the feature and label datasets for both training and testing.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Calculating a normalization factor
normalization_factor=0.99/X_train.max()

# Applying the normalization to the training features.
X_train=X_train * normalization_factor + 0.01

# Applying the same normalization to the testing features.
X_test=X_test * normalization_factor + 0.01

In [5]:
# Create a boolean array where entries are True for instances of class 7 in the training and testing labels, 
#then convert to float (1.0 for 
class7_train=([7]==y_train).astype(float)
class7_test=([7]==y_test).astype(float)

In [6]:
# Reshape the class 7 training and testing labels to a 2D array with a single column.
class7_train=class7_train.reshape(-1,1)
class7_test=class7_test.reshape(-1,1)

In [7]:
# Define the sigmoid activation function using numpy for vectorized operations, to be applied element-wise.

@np.vectorize
def sigmoid(a):
    return 1/(1+np.e**(-a))

# Define a function to generate values from a truncated normal distribution 
#within specified bounds, mean, and standard deviation.

def truncated_normal(low,upp,mean,sd):
    return truncnorm((low-mean)/sd,(upp-mean)/sd,scale=sd,loc=mean)

In [8]:
from scipy.special import expit as activate

In [9]:
class ANN:
    
    def __init__(self,no_inputs, hidden_layers, no_outputs,bias,learning_rate):
        
        self.no_inputs=no_inputs
        self.hidden_layers=hidden_layers
        self.no_outputs=no_outputs
        self.bias = bias
        self.learning_rate = learning_rate
        self.structure = [no_inputs] + hidden_layers + [no_outputs]
        self.create_weight()
    
    
    def create_weight(self):
        bias_node = 1 if self.bias else 0
        self.len_network = len(self.structure)
        layer_index = 0
        self.weights = []
        
        while layer_index < self.len_network - 1:
            nodes_out = self.structure[layer_index + 1]
            nodes_in = self.structure[layer_index]
            n = nodes_out * (nodes_in + bias_node)
            rad = 1 / np.sqrt(nodes_in)
            X=truncated_normal(low=-rad,upp=rad,mean=0,sd=1)
            wm = X.rvs(n).reshape((nodes_out, nodes_in + 1))
            self.weights.append(wm)
            layer_index += 1
        return self.weights
    
    def alternative_weight_init(self):
    
    
    #This weight initialization demonstrates an alternative approach to initializing weights for the neural network
    
    #Directly alligned with the pseudocode in the template
    
    # TO-DO:
        #for layer in range(len(hidden_layers)):
            #no_nodes = hidden_layers[layer]
            # no_inputs_to_layer = ??
            # initialise weight matrix of shape: (no_nodes, no_inputs_to_layer)
  
    
        bias_node=1 if self.bias else 0
    
        self.weights_new=[]
    
        for layer in range(len(self.hidden_layers)+1):
        
            nodes_in=self.hidden_layers[layer-1]
        
            if layer==0:
                nodes_in=self.no_inputs+bias_node
            
            if layer==3:
                nodes_out=self.no_outputs
            else:
                nodes_out=self.hidden_layers[layer]

            n = nodes_out * (nodes_in + bias_node)

            rad = 1 / np.sqrt(nodes_in)

            X=truncated_normal(low=-rad,upp=rad,mean=0,sd=1)

            wm = X.rvs(n).reshape((nodes_out, nodes_in + 1))

            self.weights_new.append(wm)

            layer += 1
        
        return self.weights_new
    
    
    def train_batch(self, X_train, labels, epochs, batch_size):
        self.combined_weights = []

        for epoch in range(epochs):
            indices = np.arange(X_train.shape[0])
            np.random.shuffle(indices)
            X_train_shuffled = X_train[indices]
            labels_shuffled = labels[indices]

            for start_idx in range(0, X_train.shape[0], batch_size):
                end_idx = min(start_idx + batch_size, X_train.shape[0])
                batch_x = X_train_shuffled[start_idx:end_idx]
                batch_y = labels_shuffled[start_idx:end_idx]

                result_vector = [batch_x.T]
                layer_index = 0

                while layer_index < self.len_network - 1:
                    input_vector = result_vector[-1]

                    if self.bias:
                        bias_terms = np.ones((1, input_vector.shape[1])) * self.bias
                        input_vector = np.concatenate((input_vector, bias_terms), axis=0)
                        result_vector[-1] = input_vector
                    
                    output_z = np.dot(self.weights[layer_index], input_vector)
                    output_activation = activate(output_z)
                    result_vector.append(output_activation)
                    layer_index += 1

                output_error = batch_y.T - result_vector[-1]
                backward_index = self.len_network - 1

                while backward_index > 0:
                    output_activation = result_vector[backward_index]
                    input_activation = result_vector[backward_index - 1]
                    gradient = np.dot(output_error, input_activation.T) / batch_x.shape[0]
                    self.weights[backward_index - 1] += self.learning_rate * gradient
                    output_error = np.dot(self.weights[backward_index - 1].T, output_error)

                    if self.bias:
                        output_error = output_error[:-1, :]
                    backward_index -= 1
            
            self.combined_weights.append(self.weights.copy())
        return self.combined_weights

    def predict(self, input_vector):
        input_vector = np.array(input_vector, ndmin=2).T
        result_vector = [input_vector]
        layer_index = 0

        while layer_index < self.len_network - 1:
            input_vector = result_vector[-1]

            if self.bias:
                input_vector = np.concatenate((input_vector, [[self.bias]]))
                result_vector[-1] = input_vector

            output_z = np.dot(self.weights[layer_index], input_vector)
            output_activation = activate(output_z)
            result_vector.append(output_activation)
            layer_index += 1
        
        return output_activation
        
    def accuracy(self, data_array, labels):
        correct, wrong = 0, 0

        for i in range(len(data_array)):
            predicted = self.predict(data_array[i])
            predicted = np.where(predicted[0][0] >= 0.5, 1, 0)
            actual_label = int(labels[i][0])

            if predicted == actual_label:
                correct += 1
            else:
                wrong += 1

        accuracy = correct / (correct + wrong)
        return accuracy

    def confusion_matrix(self, data_array, labels):
        cm = np.zeros((2, 2), int)

        for i in range(len(data_array)):
            predicted = self.predict(data_array[i])
            predicted = np.where(predicted >= 0.15, 1, 0)
            actual_label = int(labels[i][0])
            cm[actual_label, predicted] += 1

        return cm

    def precision_recall(self, label, confusion_matrix):
        
        true_positives=confusion_matrix[0,0]+confusion_matrix[1,1]
        false_positives=confusion_matrix[0,1]
        false_negatives=confusion_matrix[1,0]
        
        precision=true_positives/(true_positives+false_positives)
        recall=true_positives/(true_positives+false_negatives)
      
        print("Accuracy:\t"+str(accuracy))
        print("Precision:\t"+str(precision))
        print("Recall:\t"+str(recall))
        print(confusion_matrix)
        


In [22]:
start_binary_neural_training_time = time.time()

binary_exclusive=ANN(784,[100,80,50],1,1,0.5) #an instance of a class exclusively built for binary classification

bi_weights=binary_exclusive.train_batch(X_train,class7_train,100,64)

bi_accuracy=binary_exclusive.accuracy(X_test, class7_test)

end_binary_neural_training_time = time.time()

time_taken_binary_training=end_binary_neural_training_time - start_binary_neural_training_time

print(f"Execution time: {time_taken_binary_training} seconds")

Execution time: 285.80098581314087 seconds


In [26]:
bi_accuracy=binary_exclusive.accuracy(X_test, class7_test)
print(f"accurcy of binary neural network with sigmoid is {bi_accuracy}")

accurcy of binary neural network with sigmoid is 0.895774647887324


## Updating the ANN implement to allow binary or multi-class classification

In [10]:
# Reshape the training labels to a two-dimensional array with a single column, preparing it for one-hot encoding.
y_train_full_encoded = y_train.reshape(-1, 1)

# Reshape the testing labels similarly to the training labels, ensuring consistency in data structure.
y_test_full_encoded = y_test.reshape(-1, 1)

# Create an array representing class labels for a classification task with 10 classes (0 through 9).
class_encoder = np.arange(10)

# Perform one-hot encoding on the training labels. For each label in `y_train_full_encoded`, create an array where the index
y_train_encoded = (class_encoder == y_train_full_encoded).astype(float)

# Apply the same one-hot encoding process to the testing labels, transforming them into a binary matrix representation
y_test_encoded = (class_encoder == y_test_full_encoded).astype(float)


In [11]:
class ANN:
    
    # Initializes an ANN with specified architecture, bias, learning rate, and creates initial weights.
    
    def __init__(self,no_inputs, hidden_layers, no_outputs,bias,learning_rate):
        
        self.no_inputs=no_inputs
        self.hidden_layers=hidden_layers # List indicating the number of neurons in each hidden layer.
        self.no_outputs=no_outputs
        self.bias = bias
        self.learning_rate = learning_rate
        self.structure = [no_inputs] + hidden_layers + [no_outputs] # Full network structure including input, hidden, and output layers.
        self.create_weight() # Calls function to initialize weights based on the network structure.
    
    
    # Creates and initializes weights for each layer in the network.
    def create_weight(self):
        bias_node = 1 if self.bias else 0
        self.len_network = len(self.structure) # Length of the structure list indicating the number of layers.
        layer_index = 0 # Index to iterate through layers.
        self.weights = []  # List to hold weight matrices.
        
        # Iterates through the layers to create weight matrices between them.
        
        while layer_index < self.len_network - 1:
            nodes_out = self.structure[layer_index + 1]
            nodes_in = self.structure[layer_index]
            n = nodes_out * (nodes_in + bias_node) # Total number of weights needed.
            rad = 1 / np.sqrt(nodes_in)  # Radial basis for weight initialization range.
            X=truncated_normal(low=-rad,upp=rad,mean=0,sd=1) # Truncated normal distribution for weight initialization.
            wm = X.rvs(n).reshape((nodes_out, nodes_in + 1)) # Reshaping the randomly sampled weights.
            self.weights.append(wm)
            layer_index += 1
        return self.weights
    
    # Trains the network using batch learning over a specified number of epochs.
    
    def train_batch(self, X_train, labels, epochs, batch_size):
        self.combined_weights = []  # List to store weights after each batch update for analysis.

        for epoch in range(epochs):

            # Loop over each batch within the dataset.
            
            for start_idx in range(0, X_train.shape[0], batch_size):
                end_idx = min(start_idx + batch_size, X_train.shape[0]) # Ensures the batch does not exceed dataset size.
                batch_x = X_train[start_idx:end_idx] # Extracts features for the current batch.
                batch_y = labels[start_idx:end_idx] # Extracts labels for the current batch.

                result_vector = [batch_x.T] # Prepares the input vector for the forward pass
                layer_index = 0 # Sets layer index for forward pass.

                while layer_index < self.len_network - 1:
                    input_vector = result_vector[-1] # Takes the last layer's output as the current input.

                    if self.bias:  # Adds bias to the input vector if bias is enabled.
                        bias_terms = np.ones((1, input_vector.shape[1])) * self.bias
                        input_vector = np.concatenate((input_vector, bias_terms), axis=0)
                        result_vector[-1] = input_vector
                    
                    output_z = np.dot(self.weights[layer_index], input_vector)
                    output_activation = activate(output_z)
                    result_vector.append(output_activation)
                    layer_index += 1

                output_error = batch_y.T - result_vector[-1] # Computes error at the output layer.
                
                backward_index = self.len_network - 1 # Sets backward index for backpropagation.
                
                while backward_index > 0:
                    
                    output_activation = result_vector[backward_index]  # Activation of current layer.
                    input_activation = result_vector[backward_index - 1] # Activation of previous layer
                    
                    # Adjusts the activation for bias nodes not in the output layer.
                    
                    if self.bias and backward_index!=self.len_network - 1 and self.structure[-1]>1:
                        
                        output_activation=output_activation.copy()[:-1,:]
                    
                    # Applies derivative of the activation function for non-output layers.
                        
                    if self.structure[-1]>1:
                
                        output_error=output_error*output_activation*(1-output_activation)
                    
                    gradient = np.dot(output_error, input_activation.T) / batch_x.shape[0]

                    self.weights[backward_index - 1] += self.learning_rate * gradient # Updates weights.
                    
                    output_error = np.dot(self.weights[backward_index - 1].T, output_error) # Calculates error for previous layer.

                    # Adjusts error for bias nodes.

                    if self.bias: 
                        
                        output_error = output_error[:-1, :]
                    
                    backward_index -= 1 # Moves to the previous layer.
                
                self.combined_weights.append(self.weights) # Stores updated weights after each batch.
        return self.combined_weights   # Returns weights after training.

    def predict(self, input_vector): #this repeats the forward propagation of the above-explained training process to predict
        
        input_vector = np.array(input_vector, ndmin=2).T
        result_vector = [input_vector]
        layer_index = 0

        while layer_index < self.len_network - 1:
            input_vector = result_vector[-1]

            if self.bias:
                input_vector = np.concatenate((input_vector, [[self.bias]]))
                result_vector[-1] = input_vector

            output_z = np.dot(self.weights[layer_index], input_vector)
            output_activation = activate(output_z)
            result_vector.append(output_activation)
            layer_index += 1
        
        return output_activation
        
    # Calculates accuracy of the model on a given dataset.
    def accuracy(self, data_array, labels):
        correct, wrong = 0, 0 # Initializes counters for correct and incorrect predictions

        # Iterates over the dataset to predict and compare with actual labels.
        
        for i in range(len(data_array)):
            
            # For multi-class classification, uses argmax to find predicted class.
            
            if self.structure[-1]>1:
                
                predicted=self.predict(data_array[i]) # Predicts based on input.
                predicted_max=np.argmax(predicted) # Finds class with highest probability.
                actual_label=labels[i]  # Actual class label.
                
                 # Updates counters based on prediction accuracy.
                    
                if predicted_max==actual_label:
                    correct += 1
                else:
                    wrong += 1
                accuracy=correct/(correct+wrong)
            
            # For binary classification, thresholds the predicted value.
            
            else:
                
                predicted = self.predict(data_array[i])
                predicted = np.where(predicted[0][0] >= 0.5, 1, 0) # Applies threshold to prediction.
                actual_label = labels[i]
                
                
                # Updates counters based on prediction accuracy.

                if predicted == actual_label:
                    correct += 1
                else:
                    wrong += 1
                accuracy=correct/(correct+wrong)

        
        return accuracy

    def confusion_matrix(self, data_array, labels): # Generates a confusion matrix for the model predictions on a given dataset
        
        length=len(np.unique(labels))  # Determines size of the confusion matrix based on number of unique labels.
        
        cm=np.zeros((length,length),int) # Initializes the confusion matrix with zeros.
        
        # Iterates over the dataset to fill the confusion matrix.
        
        for i in range(len(data_array)):
            
            # For multi-class classification, identifies the predicted class and actual class.
            
            if self.structure[-1]>1:

                predicted=self.predict(data_array[i])
                predicted_max=np.argmax(predicted) # Finds class with highest probability.
                actual_label=int(labels[i])
                cm[actual_label,predicted_max]+=1  # Updates confusion matrix.

            else:

                predicted = self.predict(data_array[i])
                predicted = np.where(predicted >= 0.15, 1, 0)
                actual_label = int(labels[i])
                cm[actual_label, predicted] += 1

        return cm

    # Calculates precision and recall for a given class label based on the confusion matrix.
   
    def precision_recall(self, label, confusion_matrix):
        
        # For multi-class classification, calculates precision and recall for the specified label.
        
        if self.structure[-1]>1:
            
            true_positives=confusion_matrix[label,label]

            true_positive_plus_false_positives=confusion_matrix[:,label].sum()

            true_positives_plus_false_negatives=confusion_matrix[label,:].sum()
            
            precision=true_positives/true_positive_plus_false_positives

            recall=true_positives/true_positives_plus_false_negatives

            
         # For binary classification, simplifies precision and recall calculation.
        
        else:
            
            true_positives=confusion_matrix[0,0]+confusion_matrix[1,1]
            false_positives=confusion_matrix[0,1]
            false_negatives=confusion_matrix[1,0]
        
            precision=true_positives/(true_positives+false_positives)
            recall=true_positives/(true_positives+false_negatives)
     
        return precision,recall

### Testing ANN for Multiclass Classification

In [12]:
start_mutliclass_training_time = time.time()
multiclass_sigmoid=ANN(784,[3,4,5],10,1,0.5)
multiclass_weights=multiclass_sigmoid.train_batch(X_train,y_train_encoded,100,64)
end_mutliclass_training_time = time.time()
time_taken_multiclass_training=end_mutliclass_training_time-start_mutliclass_training_time
print(f"Execution time: {time_taken_multiclass_training} seconds")

Execution time: 55.484747886657715 seconds


In [13]:
accuracy_multiclass=multiclass_sigmoid.accuracy(X_test,y_test)
print(f"The accuracy for the multiclass classification is {accuracy_multiclass}")

The accuracy for the multiclass classification is 0.3530516431924883


In [14]:
multiclass_confusion_matrix=multiclass_sigmoid.confusion_matrix(X_test,y_test)
print(multiclass_confusion_matrix)

[[518 123   4  35  13  32  38  60   8  65]
 [117 248  17  83  72  30 189  77   7  48]
 [ 10  38 133 131   5 258  63   1   7  10]
 [  4  21  87 293  28 165 257   0  12  13]
 [ 11 125  48 168 135  76 289   2  17  25]
 [ 18  17  66 112  11 629  21   1   0  13]
 [ 17  30  22 268 126  38 276   0   7  16]
 [122  48  11  35   8  24  48 495  10  87]
 [  4 110  24 213 131  47 268   0  15  28]
 [103  77  34  90  43 155  54  45  21 266]]


In [15]:
for i in range(10):
    
    multiclass_precision,multiclass_recall=multiclass_sigmoid.precision_recall(i,multiclass_confusion_matrix)
    print(f"precision for label {i} is {multiclass_precision},while recall for label {i} is {multiclass_recall}")

precision for label 0 is 0.5606060606060606,while recall for label 0 is 0.578125
precision for label 1 is 0.2962962962962963,while recall for label 1 is 0.27927927927927926
precision for label 2 is 0.2982062780269058,while recall for label 2 is 0.2027439024390244
precision for label 3 is 0.20518207282913165,while recall for label 3 is 0.33295454545454545
precision for label 4 is 0.23601398601398602,while recall for label 4 is 0.15066964285714285
precision for label 5 is 0.4325997248968363,while recall for label 5 is 0.7083333333333334
precision for label 6 is 0.18363273453093812,while recall for label 6 is 0.345
precision for label 7 is 0.7268722466960352,while recall for label 7 is 0.5574324324324325
precision for label 8 is 0.14423076923076922,while recall for label 8 is 0.017857142857142856
precision for label 9 is 0.4658493870402802,while recall for label 9 is 0.29954954954954954


### Testing ANN for binary classification

In [16]:
start_binary_training_time = time.time()

binary_sigmoid=ANN(784,[3,4,5],1,1,0.5)

binary_weights=binary_sigmoid.train_batch(X_train,class7_train,100,64)

end_binary_training_time = time.time()

time_taken_binary_training=end_binary_training_time - start_binary_training_time

print(f"Execution time: {time_taken_binary_training} seconds")

Execution time: 46.184836864471436 seconds


In [17]:
accuracy_binary=binary_sigmoid.accuracy(X_test,class7_test)
print(f"Accuracy score for binary classification on Label 7--Ship-- is {accuracy_binary}")

Accuracy score for binary classification on Label 7--Ship-- is 0.895774647887324


In [18]:
binary_confusion_matrix=binary_sigmoid.confusion_matrix(X_test,class7_test)
binary_confusion_matrix

array([[7632,    0],
       [ 888,    0]])

## Updating the ANN to use the ReLU activation function 

In [19]:
def relu_activation(x):
    return np.maximum(0, x)


class ANN_RELU:
    
    # Initialize an ANN instance with specified architecture, bias, and learning rate.
    
    def __init__(self,no_inputs, hidden_layers, no_outputs,bias,learning_rate):
        
        self.bias = bias 
        self.learning_rate = learning_rate
        self.no_outputs=no_outputs
        self.no_inputs=no_inputs
        self.hidden_layers=hidden_layers
        self.structure = [no_inputs] + hidden_layers + [no_outputs] # Complete network structure.
        self.create_weight() # Call the method to initialize the weights of the network.
        

    # Method to create and initialize the weights of the network.
    
    def create_weight(self):
        bias_node = 1 if self.bias else 0    # Determine if a bias node should be added (1) or not (0).
        self.len_network = len(self.structure)
        layer_index = 0  # Start index for iterating through layers.
        self.weights = []  # Initialize an empty list to store the weight matrices
        
        # Loop through each layer to initialize weights until the second last layer.
        
        while layer_index < self.len_network - 1:
            
            nodes_out = self.structure[layer_index + 1] # Number of nodes in the next layer.
            
            nodes_in = self.structure[layer_index]
            
            n = nodes_out * (nodes_in + bias_node) # Total number of weights to initialize for the layer
            
            rad = 1 / np.sqrt(nodes_in)  # Scaling factor for weight initialization.
            
            X=truncated_normal(low=-rad,upp=rad,mean=0,sd=1) # Create a truncated normal distribution for weight initialization.
            
            wm = X.rvs(n).reshape((nodes_out, nodes_in + 1)) # Randomly sample weights and reshape into a matrix.
            
            self.weights.append(wm) # Append the weight matrix to the weights list.
            
            layer_index += 1 # Move to the next layer index.
            
        return self.weights
    
    
    def alternative_weight_init(self):
    
    
    #This weight initialization demonstrates an alternative approach to initializing weights for the neural network
    
    #Directly alligned with the pseudocode in the template
    
    # TO-DO:
        #for layer in range(len(hidden_layers)):
            #no_nodes = hidden_layers[layer]
            # no_inputs_to_layer = ??
            # initialise weight matrix of shape: (no_nodes, no_inputs_to_layer)
  
    
        bias_node=1 if self.bias else 0
    
        self.weights_new=[]
    
        for layer in range(len(self.hidden_layers)+1):
        
            nodes_in=self.hidden_layers[layer-1]
        
            if layer==0:
                nodes_in=self.no_inputs+bias_node
            
            if layer==3:
                nodes_out=self.no_outputs
            else:
                nodes_out=self.hidden_layers[layer]

            n = nodes_out * (nodes_in + bias_node)

            rad = 1 / np.sqrt(nodes_in)

            X=truncated_normal(low=-rad,upp=rad,mean=0,sd=1)

            wm = X.rvs(n).reshape((nodes_out, nodes_in + 1))

            self.weights_new.append(wm)

            layer += 1
        
        return self.weights_new

    
    def train_batch(self, X_train, labels, epochs, batch_size):
        
        self.combined_weights = []  # Initialize a list to store weights after each update.

        for epoch in range(epochs): # Iterate over each epoch.
            
            # Iterate over batches of the training data.
            
            for start_idx in range(0, X_train.shape[0], batch_size): 
                end_idx = min(start_idx + batch_size, X_train.shape[0]) # Determine the end index of the batch.
                batch_x = X_train[start_idx:end_idx] # Extract the batch of features.
                batch_y = labels[start_idx:end_idx] # Extract the batch of labels.

                result_vector = [batch_x.T] # Initialize the result vector with the batch's input.
                layer_index = 0 # Index to iterate through layers.

                # Forward propagation
                
                while layer_index < self.len_network - 1:
                    input_vector = result_vector[-1]

                    if self.bias:
                        bias_terms = np.ones((1, input_vector.shape[1])) * self.bias
                        input_vector = np.concatenate((input_vector, bias_terms), axis=0)
                        result_vector[-1] = input_vector
                    
                    output_z = np.dot(self.weights[layer_index], input_vector)
                    output_activation = relu_activation(output_z)
                    result_vector.append(output_activation)
                    layer_index += 1
                
                output_error = batch_y.T - result_vector[-1]  # Calculate the output error.
                
                # Backpropagation
                
                backward_index = self.len_network - 1 # Start from the last layer.
                
                while backward_index > 0:
                    
                    output_activation = result_vector[backward_index] # Get the activation for the current layer.
                    input_activation = result_vector[backward_index - 1] # Get the input activation.
                    
                    # Remove bias terms from the output activation if not in the output layer.
                    
                    if self.bias and backward_index!=self.len_network - 1 and self.structure[-1]>1:
                        
                        output_activation=output_activation.copy()[:-1,:]
                        
                    # Apply derivative of activation function if not output layer.
                    
                    if self.structure[-1]>1:
                        
                        output_error=output_error*output_activation*(1-output_activation)
        
                    relu_derivative=np.where(output_activation<=0,0,1)  # Derivative of ReLU.
                
                    output_error=output_error *relu_derivative
                    
                    
                    # Gradient calculation with clipping to prevent exploding gradients.
                        
                    clip_value = 1.0  # to prevent incompatible gradients
                    
                    gradient = np.dot(output_error, input_activation.T) / batch_x.shape[0]
                
                    gradient = np.clip(gradient, -clip_value, clip_value)
                    
                    # Check for NaN values in the weights.

                    self.weights[backward_index - 1] += self.learning_rate * gradient
                
    
                    for weight_matrix in self.weights:
                        assert not np.isnan(weight_matrix).any(), "Weights have NaN values"

                    # Update the error for the previous layer.
                    
                    output_error = np.dot(self.weights[backward_index - 1].T, output_error)
    

                    if self.bias: 
                        
                        output_error = output_error[:-1, :]
                    
                    
                    backward_index -= 1
                
                
                
                self.combined_weights.append(self.weights)
                
                
        return self.combined_weights

    def predict(self, input_vector):
        
        input_vector = np.array(input_vector, ndmin=2).T
        result_vector = [input_vector]
        layer_index = 0

        while layer_index < self.len_network - 1:
            input_vector = result_vector[-1]

            if self.bias:
                input_vector = np.concatenate((input_vector, [[self.bias]]))
                result_vector[-1] = input_vector

            output_z = np.dot(self.weights[layer_index], input_vector)
            output_activation = relu_activation(output_z)
            result_vector.append(output_activation)
            layer_index += 1
        
        return output_activation
        
    def accuracy(self, data_array, labels):
        correct, wrong = 0, 0 # Initialize counters for correct and wrong predictions.

        for i in range(len(data_array)):
            
            # For multi-class classification.
            
            if self.structure[-1]>1:
                
                predicted=self.predict(data_array[i]) # Get the model's prediction.
                predicted_max=np.argmax(predicted) # Find the class with the highest probability
                actual_label=labels[i] # Get the actual label
                
                # Increment the correct or wrong counter based on the prediction accuracy
                
                if predicted_max==actual_label:
                    correct += 1
                else:
                    wrong += 1
                    
                # Calculate accuracy as the ratio of correct predictions to total predictions.
                
                accuracy=correct/(correct+wrong)
            
              
            else:
                
                predicted = self.predict(data_array[i]) # Get the model's prediction.
                predicted = np.where(predicted[0][0] >= 0.5, 1, 0)  # Threshold the prediction at 0.5.
                actual_label = labels[i]

                if predicted == actual_label:
                    correct += 1
                else:
                    wrong += 1
                accuracy=correct/(correct+wrong)

        
        return accuracy
    
   
    def confusion_matrix(self, data_array, labels):
        
        length=len(np.unique(labels)) # Determine the size of the confusion matrix
        
        cm=np.zeros((length,length),int) # Initialize the confusion matrix with zeros
        
        # Iterate over each example in the dataset
        
        for i in range(len(data_array)):
            
            # For multi-class classification.
            
            if self.structure[-1]>1:

                predicted=self.predict(data_array[i])
                predicted_max=np.argmax(predicted)
                actual_label=int(labels[i])
                cm[actual_label,predicted_max]+=1 # Increment the corresponding cell in the confusion matrix

            else:

                predicted = self.predict(data_array[i])
                predicted = np.where(predicted >= 0.15, 1, 0)
                actual_label = int(labels[i])
                cm[actual_label, predicted] += 1

        return cm
    
    
  
    def precision_recall(self, label, confusion_matrix):
        
    # For multi-class classification.
        if self.structure[-1] > 1:

            true_positives = confusion_matrix[label, label]  # Get true positives for the given label.

            # Calculate the sum of the column for the given label to get true positives plus false positives.
            true_positive_plus_false_positives = confusion_matrix[:, label].sum()

            # Calculate the sum of the row for the given label to get true positives plus false negatives.
            true_positives_plus_false_negatives = confusion_matrix[label, :].sum()

            # Calculate precision and recall for the given label.
            precision = true_positives / true_positive_plus_false_positives
            recall = true_positives / true_positives_plus_false_negatives

        # For binary classification.
        else:

            true_positives = confusion_matrix[0, 0] + confusion_matrix[1, 1]  # Sum of the diagonal elements for true positives.
            false_positives = confusion_matrix[0, 1]  # False positives.
            false_negatives = confusion_matrix[1, 0]  # False negatives.

            # Calculate precision and recall for binary classification.
            precision = true_positives / (true_positives + false_positives)
            recall = true_positives / (true_positives + false_negatives)

        return precision, recall  # Return the calculated precision and recall.



In [20]:
start_RELU_training_time = time.time()

relu_neural=ANN_RELU(784,[100,80,50],10,1,0.001)

relu_weights=relu_neural.train_batch(X_train,y_train_encoded,100,64)

end_RELU_training_time = time.time()

time_taken_RELU_training=end_RELU_training_time - start_RELU_training_time

print(f"Execution time: {time_taken_RELU_training} seconds")

accuracy_relu=relu_neural.accuracy(X_test,y_test)
print(f"Accuracy for RELU is {accuracy_relu}")

Execution time: 360.53876781463623 seconds
Accuracy for RELU is 0.16490610328638497


## Analysis for sigmoid neural network

●How much better are the results for object recognition, compared to the single-layer perceptron? 

Interestingly, the sigmoid neural network performed closely in accuracy (accuracy of 89.56%) to the single-perceptron sigmoid (accuracy of 89.87%). However, the sigmoid neural network took far more time to train.


● How did you modify the initial weights, learning rate, and iterations to achieve this? 

I significantly increased the number of nodes in the hidden network to improve learning and avoid overfitting, enabling the model to better generalize to unseen data.

● How much faster/slower is the training time, compared to the single-layer perceptron? 

Took far more training time: 285.80098581314087 seconds

● How much quicker/slower does the learning converge, compared to the single-layer perceptron?  The single-layer perceptron converged way quicker than its multi-layer sigmoid equivalent.