Deep Learning Assignment 1: implementing a Neural Network


Part 1: Implement Logistic Regression

The primary 'fit' method (inspired from Scikit learns similar method) takes as input a list of input values 'pred_vars' of arbitrary number of feature values (so long as it is consistent) and the corresponding target variable list of binary values.
The other paramaters are considered hyper parameters, all with a default value considered a 'good default value' by many sources.

The logistic regressor then operates as a normal one would from closely following the lecture notes, using Stochastic Gradient Descent in minimising the cost function.
    

In [79]:
#Import necessary libraries
import numpy as np     #For basic matrix and list operations
import random as rand  #For random number generation
seed = 999
rand.seed(seed)        #So results are reproducible for comparisons
import pandas as pd    #For loading csv files and having clean data frames


class Logistic:
    
    #Primary funtion which learns the weights and biases for prediction
    def fit(self, pred_vars,target_var,learning_rate = 0.1 ,max_iter = 10000 ,threshold = 1e-6):

        #Let x and y be the predictor and target variables respectively
        self.x = pred_vars        
        self.y = target_var 
        self.num_vars = len(pred_vars[0])    #store number of predictor variables
        self.num_points = len(target_var)    #store number of data points supplied

        #randomly initialise w and b to float in  range [0,1]
        self.w = [rand.random() for i in range(self.num_vars)]
        self.b = rand.random()


        iters = 0   #To store current iteration of regressor in order to check if max_iterations has been reached
        j_prev = 0  #To store the previous loss value to check against current
                    #loss value if we have reached a sufficient minima (relating to threshold)

        while iters < max_iter:   #if max_iters reached, stop learning
            rand_num = rand.sample(range(self.num_points),1)[0]  #take 1 random number in range [0,num_points-1] as python indexes starting with 0
            #Extract this random random point from the data 
            x_i = self.x[rand_num] 
            y_i = self.y[rand_num]

            y_hat = self.sigmoid(np.dot(self.w,x_i) + self.b) #representing probability of point x having class 1
            j_curr = -(y_i*np.log(y_hat) + (1-y_i)*np.log(1-y_hat)) #objective function we wish to minimise 

            if abs(j_curr - j_prev) < threshold: #compare j value to previous one. if a given threshold has been reached , the algorithm has 'Learned enough'
                #print('Threshold reached after ' + str(iters) + ' iterations')
                break  #Stop learning

            #Gradient Descent stage
            #Adjust our previous values of w and b subject to our guess for what the target variable was and what it actually is
            delta_w = [(y_hat - y_i)*x_i[j] for j in range(self.num_vars)]
            delta_b = y_hat - y_i

            #Update the values using the specified learning rate
            self.w = [self.w[j] - learning_rate*delta_w[j] for j in range(self.num_vars)]
            self.b -= learning_rate*delta_b

            #Learning iteration complete..
            iters += 1    #increment iteration at end
            j_prev = j_curr  #update previous loss value to current one



        return None #Stores values listed but doesn't explicitely return a value

    def sigmoid(self,z): #Trademark activation function for Logistic Regression (sigmoid/logistic function)
        return 1/(1+np.exp(-z))

    #Method to predict list of points of same number of fetures as those in training data
    def predict(self,x_data, hard_threshold = 0.5):
        #Hard threshold is in range[0,1] and decides if class labelled as 0 or 1. If HT is higher then more points will be
        #classed as 1 and vice versa for 0. Best kept at 0.5 but can be varied if desired.
        
        y_hat = [self.sigmoid(np.dot(self.w,x) + self.b) for x in x_data] #probability point is of class 1

        self.prediction = [int(y >= hard_threshold) for y in y_hat] #compare to HT, if greator or equal then label class 0, else class 1
        return self.prediction

Part 2  Train a logistic regressor using your code from Part 1, and see how it 
performs on both datasets

In [99]:
#Also using function for my case studies assignment
#data must be in form of pandas data frame, specify probabilities of train, validation, test
def train_valid_test(data,p_train,p_validation,p_test):
    num_data_points = len(data) #Store number of points
    #Get number of validation and test points to use from probabilities give.
    number_of_validation_points = int(num_data_points*p_validation) 
    number_of_test_points = int(num_data_points*p_test)
    
    #list of random indexes to sample from data for validation set, sorted to select these rows from data later
    rows_for_validation = sorted(rand.sample(range(num_data_points), number_of_validation_points))

    rows_for_test = []
    #Populate rows for test incrementally from those not selected for validation
    while len(rows_for_test) < number_of_test_points: 
        rand_number = rand.sample(range(num_data_points), 1)[0] # 1 random number in range [0,number data points-1]
        if rand_number not in rows_for_validation + rows_for_test: #check if number is in validation selection
            rows_for_test.append(rand_number) # if not then add to rows for test

    rows_for_test = sorted(rows_for_test) #Sort this to subset data by the indexes in this list

    #by elimination, add the remaining rows to train set 
    rows_for_train = [i for i in range(num_data_points) if i not in sorted(rows_for_validation + rows_for_test)]

    #subset data into respective train, validation, test
    train_data = data.loc[rows_for_train]
    validation_data = data.loc[rows_for_validation]
    test_data = data.loc[rows_for_test]
    
    return [train_data,validation_data,test_data]

    
#read datasets
blobs = pd.read_csv("blobs250.csv")
moons = pd.read_csv("moons400.csv")

#Normalise the predictor variables in the datasets
blobs["X0"] = (blobs["X0"]-blobs["X0"].mean()) /blobs["X0"].std()
blobs["X1"] = (blobs["X1"]-blobs["X1"].mean()) /blobs["X1"].std()
blobs["X2"] = (blobs["X2"]-blobs["X2"].mean()) /blobs["X2"].std()

moons["X0"] = (moons["X0"]-moons["X0"].mean()) /moons["X0"].std()
moons["X1"] = (moons["X1"]-moons["X1"].mean()) /moons["X1"].std()

#Split as per specification
blobs_train,blobs_validation,blobs_test = train_valid_test(blobs,0.7,0.15,0.15)
moons_train,moons_validation,moons_test = train_valid_test(moons,0.7,0.15,0.15)


#Split each data set into feature set and target set and normalise feature set
#Split data into feature set and target set
blobs_y_train = blobs_train["Class"].to_numpy()
blobs_X_train = blobs_train.drop(columns= "Class").to_numpy()

blobs_y_validate = blobs_validation["Class"].to_numpy()
blobs_X_validate = blobs_validation.drop(columns= "Class").to_numpy()

blobs_y_test = blobs_test["Class"].to_numpy()
blobs_X_test = blobs_test.drop(columns= "Class").to_numpy()


moons_y_train = moons["Class"].to_numpy()
moons_X_train = moons.drop(columns= "Class").to_numpy()

moons_y_validate = moons_validation["Class"].to_numpy()
moons_X_validate = moons_validation.drop(columns= "Class").to_numpy()

moons_y_test = moons_test["Class"].to_numpy()
moons_X_test = moons_test.drop(columns= "Class").to_numpy()



In [100]:
#Time to test

#First test performance using default parameters learning_rate = 0.1 ,max_iter = 10000 ,threshold = 1e-6
#testing on test data, but saving validation set for tuning
learning_rate = 0.1 
max_iter = 10000
threshold = 1e-6

my_logistic = Logistic()
blobs_score = []
moons_score = []
for i in range(50):    #taking an average of 50 predictions
    rand.seed(seed+i)
    #Blobs
    my_logistic.fit(blobs_X_train,blobs_y_train,learning_rate,max_iter,threshold)
    predicted = my_logistic.predict(blobs_X_test)
    blobs_score.append(np.mean(blobs_y_test == predicted)) #caclculates accuracy of predictions

  #Moons
    my_logistic.fit(moons_X_train,moons_y_train,learning_rate,max_iter,threshold)
    predicted = my_logistic.predict(moons_X_test)
    moons_score.append(np.mean(moons_y_test == predicted))     
        
print("blobs accuracy before tuning = " + str(np.round(np.mean(blobs_score),3)) +' : learning rate = 0.1')
print("moons accuracy before tuning = " + str(np.round(np.mean(moons_score),3)) +' : learning rate = 0.1')





#Tune
learning_rates = [0.15,0.125,0.1,0.075,0.05,0.01,0.001] #range of learning rates to test, noting that 0.1 perfomed well originally
#Keeping these constant and focusing on finding optimal learning rate
max_iter = 10000
threshold = 1e-6

blobs_training_score = []
moons_training_score = []


for learning_rate in learning_rates:
    predicted_moons = []
    predicted_blobs = []
    for i in range(15):    #lower averaging to save computation time
        rand.seed(seed+i)
        #Blobs
        my_logistic.fit(blobs_X_train,blobs_y_train,learning_rate ,max_iter ,threshold )
        predicted_blobs.append(my_logistic.predict(blobs_X_validate))
      #Moons
        my_logistic.fit(moons_X_train,moons_y_train,learning_rate ,max_iter ,threshold)
        predicted_moons.append(my_logistic.predict(moons_X_validate))
        
    blobs_training_score.append([learning_rate,np.mean(blobs_y_validate == predicted_blobs)])
    moons_training_score.append([learning_rate,np.mean(moons_y_validate == predicted_moons)])
#save index of learning rate from score... 
index_of_best_blobs_LR = np.argmax([blobs_training_score[i][1] for i in range(len(learning_rates))])
index_of_best_moons_LR = np.argmax([moons_training_score[i][1] for i in range(len(learning_rates))])

#Specify best learning rates for each dataset
best_blobs_LR = learning_rates[index_of_best_blobs_LR]
best_moons_LR = learning_rates[index_of_best_moons_LR]

my_logistic = Logistic()

#Test again with tuned parameters
blobs_score = []
moons_score = []
for i in range(50):    
    rand.seed(seed+i)
    #Blobs
    my_logistic.fit(blobs_X_train,blobs_y_train,best_blobs_LR ,max_iter ,threshold )
    predicted = my_logistic.predict(blobs_X_test)
    blobs_score.append(np.mean(blobs_y_test == predicted))

  #Moons
    my_logistic.fit(moons_X_train,moons_y_train,best_moons_LR,max_iter ,threshold)
    predicted = my_logistic.predict(moons_X_test)
    moons_score.append(np.mean(moons_y_test == predicted))     
        
print("blobs accuracy after tuning = " + str(np.round(np.mean(blobs_score),3)) +' : learning rate = ' + str(best_blobs_LR ))
print("moons accuracy after tuning = " + str(np.round(np.mean(moons_score),3)) +' : learning rate = ' + str(best_moons_LR ))


blobs accuracy before tuning = 1.0 : learning rate = 0.1
moons accuracy before tuning = 0.905 : learning rate = 0.1
blobs accuracy after tuning = 1.0 : learning rate = 0.15
moons accuracy after tuning = 0.903 : learning rate = 0.1


After tuning,I found that the learning rate stayed the same for the moons dataset, making no major difference to the accuracucy , however for moons the learning rate was changed, but the accuracy remained constant. After 50 iterations averaged, the blobs dataset was labelled correctly 100% of the time for a learning rate of 0.1 and the moons data set labelled correctly 90% of the time for a learning rate of 0.1 . Moons is indeed the dataset which is not linearly sepeable and thus would generally be harder to predict than the blobs dataset.

Part 3: Implement and Test a Shallow Neural Network

The class Neural Net takes inputs ; pred_vars,target_var,learning_rate,max_iter and threshold, same as logistic regression.
Extra inputs include , epochs : number of forward and backward passes of all data points in training set
int batch_size : how many points to pass through the forward and back propagation at once
float converge_thresh : The threshold at which point the difference in cross entropy loss between epochs converges to.
str Activation_fn: Speicifies which activation function to use, takes values 'relu','leaky_relu','tanh','logistic', default 'logistic'.
list shape: takes form [a,b,c...] where each a,b,c.. is an integer which represents number of nodes in hidden layer. The length of this represents how many hidden layers, e.g. [10] represents a NN with 1 hidden layer with 10 nodes in it, [10,10,2] a NN with 3 hidden layers of nodes, 10 , 10 and 2 each layer. A logistic regression would be represented as [] which is a 0 hidden layer neural net.
int output: number of output nodes in the final leyer
float regularisation_penalty : penalty term to reduce complexity in the network
        


In [101]:
class Neural_net:
    
    class node: #node class resembling that of logistic class as above, repurposed for Neural Net

        def __init__(self,num_vars,batch_size,activation_fn = 'logistic',regularisation = 0):
            self.num_vars = num_vars # num variables fed into node from previous layer
            self.batch_size = batch_size #overall batch size of NN
            self.activation_fn = activation_fn  #specified activation function
            self.regularisation = regularisation

        def randomise_w_b(self): # initialise random weights and bias for node with float value in range [0,1]
            self.w =[rand.random() for i in range(self.num_vars)]
            self.b =rand.random()

        def update_w_b(self,learning_rate): # update w and b after each epoch
            self.w -= np.multiply(learning_rate,self.delta_w)
            self.b -= learning_rate*self.delta_b

        def update_a(self,val): #method to feed input data points of length batch_size into nodes layer 0
            self.a = val


        def forward(self, layer_before): #forward propagation, taking input from nodes in the layer before the current one
            self.z = [np.sum([np.dot(self.w,[batch[j] for batch in layer_before]) + self.b]) for j in range(self.batch_size)]
            self.a = [self.f(z) for z in self.z]
    
  
        def start_backward(self,y,layer_before): #run this method on each node on the output layer
            #Formulae as per the notes, with added consideration for mini batch
            self.delta_z = [self.a[i] - y[i] for i in range(self.batch_size)]
            self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before]) 
                            for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)
                #The addition term in delta w takes the L2 regularisation parameter into consideration, note default
                #value is 0 so this does not affect the calculation unless the user wishes
            
            #take average of all values in batch to update w and b later
            self.delta_w = [np.mean([self.delta_w[i][j] for i in range(self.batch_size)]) for j in range(self.num_vars)]
            self.delta_b = np.mean(self.delta_z)


        def backward(self,layer_before,layer_after): # Run this method for each in all other layers
            #equations as per the notes, similar setup the start_backwards method
            self.delta_z = np.multiply([self.f_prime(z) for z in self.z],
                                 [np.sum([np.multiply(node.delta_z[ind],node.w) for node in layer_after]) for ind in range(self.batch_size)]) 
            
            self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before])
                            for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)

            self.delta_w = [np.mean([self.delta_w[i][j] for i in range(self.batch_size)]) for j in range(self.num_vars)]
            self.delta_b = np.mean(self.delta_z)

            
        def start_prediction(self,inp): # similar to the update a method as above.
            #feeds the x values to be predicted into the nodes in layer 0 of the network
            self.prediction = inp

        def predict(self,x,return_probs=False, hard_threshold = 0.5):
            #passes the x values to be predicted through the network to return a binary value if return probs = false
            #else return probability of class 1
            y_hat = self.f(np.dot(self.w,x) + self.b)
            if  return_probs == False:
                self.prediction = int(y_hat >= hard_threshold)
            else:
                self.prediction = y_hat
            return self.prediction
        
        def f(self,z): #activation function which checks which fn to use
            if self.activation_fn == 'relu':
                f_z = self.ReLU(z)
            elif self.activation_fn == 'leaky_relu':
                f_z = self.leakyReLU(z)
            elif self.activation_fn == 'tanh':
                f_z = self.tanh(z)
            else:
                f_z = self.sigmoid(z)
            
            return f_z
        
        def f_prime(self,z): #derivative of activation function
            if self.activation_fn == 'relu':
                fprime_z = self.ReLU_prime(z)
            elif self.activation_fn == 'leaky_relu':
                fprime_z = self.leakyReLU_prime(z)
            elif self.activation_fn == 'tanh':
                fprime_z = self.tanh_prime(z)
            else:
                fprime_z = self.sigmoid_prime(z)
            
            return fprime_z        
        
        #Different functions as found in the notes
        def sigmoid(self,z):
            return 1/(1+np.exp(-z))
        
        def sigmoid_prime(self,z):
            return self.sigmoid(z)*(1-self.sigmoid(z))
        
        def tanh(self,z):
            return (np.exp(z)-np.exp(-z))/(np.exp(z)+ np.exp(-z))
        
        def tanh_prime(self,z):
            return 1 - self.tanh(z)**2
        
        def ReLU(self,z):
            return 0 if z<0.5 else z
    
        def ReLU_prime(self,z):
            return 0 if z<0.5 else 1
        
        def leakyReLU(self,z):
            return 0.01*z if z<0.5 else z
    
        def leakyReLU_prime(self,z):
            return 0.01 if z<0.5 else 1





    def fit(self, pred_vars,target_var,learning_rate = 0.1 ,max_iter = 10000,converge_thresh = 0.01 ,activation_fn = 'logistic',
            epochs = 5,batch_size = 32,shape = [],output = 1,regularisation_penalty = 0):
        #providing default values for all parameters, except the data
        #similar setup to logistic regression
        self.x = pred_vars
        self.y = target_var 
        self.num_vars = len(pred_vars[0])
        self.num_points = len(target_var)
        self.output = output # save number of outputs desired
    
        #The shape of the NN is built from , layer 0, which has number of nodes equal to number of input features,
        #hidden layers as specified by user, and the output layer populated with whatever the user specifies.
        shape =  [self.num_vars] + shape + [output]

        self.length = len(shape) # length of Neural net (input + #hidden + output layers)
        
        #input vars represents how many nodes from the previous layer are used to feed forward into the current layer,
        #layer 0 has 0 nodes feeding into them, hence the 0. Then append the shape list as a layer 1 node would have all
        #layer 0's nodes feeding into it.
        input_vars = [0] + shape
        
        #Build the neural net as specified, placing a node class at each place a node is expected
        self.NN = [[self.node(num_vars = input_vars[j],batch_size = batch_size,activation_fn = activation_fn,
                              regularisation = regularisation_penalty) for i in range(shape[j])] for j in range(self.length) ]
        
        [node.randomise_w_b() for layer in self.NN for node in layer ] #initiate weights and biases for each node in NN
        iters = 0  #initialising the iteration number
        current_epoch = 1 #Starting 1 as I use >= in the while statement. If epochs is = 5 then 
                          #there will be 5 epochs of the NN. Begining at 1 means the systems prints the correct
                          #current epoch as below 
        indexes = [i for i in range(self.num_points)] #List of indexes of the training points. This will be sampled from
                                # without replacement, then once it empties, one epoch finishes and it is replenished again
        prev_cross_ent_loss = 100 #initialise a high value for loss
        while iters < max_iter and epochs >= current_epoch: # stop learning if max iterations or max epochs reached
            if len(indexes) < batch_size: #if list of indexes is less than the points required for current batch
                if current_epoch == epochs:
                  #print("Finished Training")  #uncomment out if desired
                  break
                #print("epoch # " + str(current_epoch) +" complete")  #uncomment out if desired
                sample = indexes #sample the remaining indexes 
                to_keep = sample # as these points are from this epochs, we dont wish to exclude them from the next epoch
                indexes = [i for i in range(self.num_points)] #replenish the index list
                current_epoch += 1 #next epoch officially started
                
                s_i = self.predict(x_i,return_probs=True) #test learned model performance so far on a set of size batch size
                cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
                if prev_cross_ent_loss - cross_ent_loss <converge_thresh : #check if system has learned enough
                    #if the system has achieved the convergence threshold, or has got worse, stop learning.
                    #print("convergence reached after " + str(current_epoch) + " epochs")
                    break
                prev_cross_ent_loss = cross_ent_loss # update loss
                
                while len(sample) < batch_size: #now to top up sample until it has length batch size
                    to_add = rand.sample(indexes,batch_size- len(sample)) # sample numbers from indexes of length required
                                                                        # to top up sample to length batch size
                    [sample.append(num) for num in to_add if num not in sample] #check if the sampled number is already in sample,
                                                                            #if not append to list, if so resample from indexes
                to_remove = [sample[i] for i in range(batch_size) if sample[i] not in to_keep]
                #points we wish to remove from this epoch as they are now being sampled from this current epoch
                
            else: #if length of indexes is big enough for the batch size
                sample = rand.sample(indexes,batch_size) #take a sample without replacement
                to_remove = sample #points to remove from the indexes list
        
            indexes = [indexes[i] for i in range(len(indexes)) if indexes[i] not in to_remove] #remove the sampled points from 
                                            #indexes so the system doesn't re select them in this epoch

            x_i = [self.x[rand_num] for rand_num in sample] #choose random batch as per the list of sampled indexes
            y_i = [self.y[rand_num] for rand_num in sample]
            
            #feed input points into nodes in layer 0
            [self.NN[0][ind].update_a([x_i[b][ind] for b in range(batch_size)]) for ind in range(self.num_vars)]
            #propagate forward through the system 
            [[node.forward([prev_node.a for prev_node in self.NN[i-1]]) for node in self.NN[i]]  for i in range(1,self.length) ]
            iters += 1 # another pass through the system complete
            
            #Begin back propagation with all nodes in final layer
            [self.NN[-1][i].start_backward(y_i,self.NN[-2]) for i in range(output)]

      
            for layer_ind in range((self.length-2),0,-1): #traverse backwards through net not including last layer
                next_layer = self.NN[layer_ind+1] #taking the layer either side of the current one
                layer_before = self.NN[layer_ind-1]
                for node in self.NN[layer_ind]: #backpropagate for each node in the current layer
                    node.backward(layer_before,next_layer)
            
            iters += 1 # another pass through the system complete
            
            #update weights and biases using learning rate specified
            [node.update_w_b(learning_rate) for layer in self.NN[1:len(self.NN)] for node in layer]



    def predict(self,x_data,return_probs = False):
        #takes a list of data points of same number of features as taken by the fit method
        #Can specify to return category labels, or probability of label
        prediction = []
    
        for point in x_data: #iterate through each point given to be predicted
            [self.NN[0][i].start_prediction(point[i]) for i in range(len(point))] #feed input points into nodes in layer 0
            [[node.predict([prev_node.prediction for prev_node in self.NN[i-1]],return_probs=return_probs) for node in self.NN[i]]  for i in range(1,len(self.NN))]
            #feed the points forward through the network and output a prediction
            prediction.append([self.NN[-1][i].prediction for i in range(self.output)]) #append the prediction made in every output node

        return prediction #return the list of predictions

In [102]:

NN = Neural_net()
#Testing the NN with 0 hidden layers,the default logistic activation function and a batch size of one to emulate a logistic regression
#using stochastic gradient descent
blobs_score = []
moons_score = []
learning_rate = 0.1
max_iter = 10000
epochs = 20
for i in range(50):    #taking an average of 50 predictions
    rand.seed(seed+i) #change the random seed at each iteration to expose learning to different scenarios but still maintain reproducibility
    #Blobs
    NN.fit(blobs_X_train,blobs_y_train,learning_rate ,max_iter ,epochs = epochs,batch_size = 1,shape = [] )
    predicted = NN.predict(blobs_X_test)
    blobs_score.append(np.mean(blobs_y_test == predicted))
  
    NN.fit(moons_X_train,moons_y_train,learning_rate,max_iter ,epochs = epochs,batch_size = 1,shape = [] )
    predicted = my_logistic.predict(moons_X_test)
    moons_score.append(np.mean(moons_y_test == predicted))     
  
        
print("blobs accuracy for Neural net immitating a logistic regression = " + str(np.round(np.mean(blobs_score),3)) +' : learning rate = 0.1')
print("moons accuracy for Neural net immitating a logistic regression = " + str(np.round(np.mean(moons_score),3)) +' : learning rate = 0.1')


blobs accuracy for Neural net immitating a logistic regression = 0.582 : learning rate = 0.1
moons accuracy for Neural net immitating a logistic regression = 0.9 : learning rate = 0.1


Although this should behave like a logistic regression, the results are radically different. The big difference which gives this result is that the basic logistic regression evaluates its performance at every iteration, however the neural net structure evaluates at every epoch, and it also uses a cross entropy loss function which differs from that implemented for the logistic regression.
The results are a slight improvement for the non-linearly seperable moons dataset, however a radically worse prediction score for the linearly seperable blobs dataset. My theory is that the NN is overfitting the blobs data as it learns for many more iterations than the logistic regression before being evaluated given a certain threshold. The moons dataset accuracy has improved as it requires a more complicated model to fit it, which the NN achieves.

In [103]:
NN = Neural_net()

#Now train the neural net with with hidden layer with varying number of nodes in hidden layer
#Using a sigmoid activation function and keeping everything else constant

max_iter = 10000
learning_rate = 0.1 
max_iter = 10000 
activation_fn = 'logistic'
epochs = 20
shapes = [[10],[25],[50],[75],[100],[200]] #various different shapes to try


blobs_training_score = []
moons_training_score = []
for shape in shapes: # loop through all shapes to test
    blobs_score = []
    moons_score = []
    for i in range(10): #average over 10 iterations of the one shape
        rand.seed(seed+i)
        #Blobs
        NN.fit(blobs_X_train,blobs_y_train,learning_rate,max_iter,epochs = epochs,batch_size = 1,shape = shape)
        predicted = NN.predict(blobs_X_validate)
        actual = blobs_y_validate
        blobs_score.append(np.mean(actual == predicted))
        
        #Moons
        NN.fit(moons_X_train,blobs_y_train,learning_rate,max_iter,epochs = epochs,batch_size = 1,shape = shape)
        predicted = NN.predict(moons_X_validate)
        actual = moons_y_validate
        moons_score.append(np.mean(actual == predicted))
    blobs_training_score.append([np.mean(blobs_score),shape[0]])  #average the accuracy of 10 iterations and save the
    moons_training_score.append([np.mean(moons_score),shape[0]])  #num nodes used 
    
#save index of highest score to extract the number of nodes used
index_of_best_blobs_numnodes = np.argmax([blobs_training_score[i][0] for i in range(len(shapes))])
index_of_best_moons_numnodes = np.argmax([moons_training_score[i][0] for i in range(len(shapes))])

#Specify best shape for each dataset
best_blobs_shape = shapes[index_of_best_blobs_numnodes]
best_moons_shape = shapes[index_of_best_moons_numnodes]

#Store data in a pandas dataframe to visualise
blobs_training = pd.DataFrame(blobs_training_score,columns = ["accuracy","num_nodes"]) 
moons_training = pd.DataFrame(moons_training_score,columns = ["accuracy","num_nodes"]) 

print("blobs training scores")
print(blobs_training)
print("moons training scores")
print(moons_training)

blobs_score = []
moons_score = []
#using best shape from training, test on test set
for i in range(50):    #taking an average of 50 predictions
    rand.seed(seed+i)
    #Blobs
    NN.fit(blobs_X_train,blobs_y_train,learning_rate ,max_iter ,epochs = epochs,batch_size = 1,shape = best_blobs_shape )
    predicted = NN.predict(blobs_X_test)
    blobs_score.append(np.mean(blobs_y_test == predicted))
  
    NN.fit(moons_X_train,moons_y_train,learning_rate,max_iter ,epochs = epochs,batch_size = 1,shape = best_moons_shape )
    predicted = my_logistic.predict(moons_X_test)
    moons_score.append(np.mean(moons_y_test == predicted))     
  
        
print("blobs accuracy for Neural net 1 hidden layer and "  +str(best_blobs_shape[0]) + " nodes = " + str(np.round(np.mean(blobs_score),3)))
print("moons accuracy for Neural net with 1 hidden layer and "+str(best_moons_shape[0]) + " nodes = "  + str(np.round(np.mean(moons_score),3)))




blobs training scores
   accuracy  num_nodes
0  0.517896         10
1  0.517896         25
2  0.517896         50
3  0.517896         75
4  0.517896        100
5  0.517896        200
moons training scores
   accuracy  num_nodes
0  0.501778         10
1  0.521667         25
2  0.526667         50
3  0.526667         75
4  0.527333        100
5  0.505222        200
blobs accuracy for Neural net 1 hidden layer and 10 nodes = 0.582
moons accuracy for Neural net with 1 hidden layer and 100 nodes = 0.9


Interestingly, no improvement was gained from including a hidden layer. I also note that the performance on the moons data set was much higher when tested on the holdout data over the tests performed on the validation set as the data was divided completely randomly.. The only reason I can see why this might have occured is because only 10 iterations were used during training and 50 were used for the testing, perhaps some randomly generated weights gave a better starting point than others.

In [104]:
NN = Neural_net()

#Out of curiosity I will try different acivation functions with the Neural Net, although
#I don't expect the others to perform as well at the logistic activation function


max_iter = 10000
learning_rate = 0.1 
max_iter = 10000 
activation_fn = ['relu','leaky_relu','tanh','logistic']
epochs = 20


blobs_training_score = []
moons_training_score = []
for activation in activation_fn: # loop through all shapes to test
    blobs_score = []
    moons_score = []
    for i in range(10): #average over 10 iterations of the one shape
        rand.seed(seed+i)
        #Blobs
        NN.fit(blobs_X_train,blobs_y_train,learning_rate,max_iter,epochs = epochs,batch_size = 1,shape = best_blobs_shape, activation_fn = activation)
        predicted = NN.predict(blobs_X_validate)
        actual = blobs_y_validate
        blobs_score.append(np.mean(actual == predicted))
        
        #Moons
        NN.fit(moons_X_train,blobs_y_train,learning_rate,max_iter,epochs = epochs,batch_size = 1,shape = best_moons_shape, activation_fn = activation)
        predicted = NN.predict(moons_X_validate)
        actual = moons_y_validate
        moons_score.append(np.mean(actual == predicted))
    blobs_training_score.append([np.mean(blobs_score),activation])  #average the accuracy of 10 iterations and save the
    moons_training_score.append([np.mean(moons_score),activation])  #activation function used 
    
#save index of highest score to extract the number of nodes used
index_of_best_blobs_function = np.argmax([blobs_training_score[i][0] for i in range(len(activation_fn))])
index_of_best_moons_function = np.argmax([moons_training_score[i][0] for i in range(len(activation_fn))])

#Specify best shape for each dataset
best_blobs_function = activation_fn[index_of_best_blobs_function]
best_moons_function = activation_fn[index_of_best_moons_function]

#Store data in a pandas dataframe to visualise
blobs_training = pd.DataFrame(blobs_training_score,columns = ["accuracy","activation_function"]) 
moons_training = pd.DataFrame(moons_training_score,columns = ["accuracy","activation_function"]) 

print("blobs training scores")
print(blobs_training)
print("moons training scores")
print(moons_training)

blobs_score = []
moons_score = []
#using best shape from training, test on test set
for i in range(50):    #taking an average of 50 predictions
    rand.seed(seed+i)
    #Blobs
    NN.fit(blobs_X_train,blobs_y_train,learning_rate ,max_iter ,epochs = epochs,batch_size = 1,shape = best_blobs_shape ,activation_fn = best_blobs_function )
    predicted = NN.predict(blobs_X_test)
    blobs_score.append(np.mean(blobs_y_test == predicted))
  
    NN.fit(moons_X_train,moons_y_train,learning_rate,max_iter ,epochs = epochs,batch_size = 1,shape = best_moons_shape, activation_fn = best_moons_function )
    predicted = my_logistic.predict(moons_X_test)
    moons_score.append(np.mean(moons_y_test == predicted))     
  
        
print("blobs accuracy for Neural net 1 hidden layer, "  +str(best_blobs_shape[0]) + " nodes and function "+ best_blobs_function +" = " + str(np.round(np.mean(blobs_score),3)))
print("moons accuracy for Neural net with 1 hidden layer, "+str(best_moons_shape[0]) + " nodes and function " + best_moons_function + " = "  + str(np.round(np.mean(moons_score),3)))




  cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
  cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
  if prev_cross_ent_loss - cross_ent_loss <converge_thresh : #check if system has learned enough
  self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before])
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)
  self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before])
  [np.sum([np.multiply(node.delta_z[ind],node.w) for node in layer_after]) for ind in range(self.batch_size)])
  self.delta_z = np.multiply([self.f_prime(z) for z in self.z],
  for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)
  self.z = [np.sum([np.dot(self.w,[batch[j] for batch in layer_before]) + self.b]) for j in range(s

blobs training scores
   accuracy activation_function
0  0.547042                relu
1  0.481081          leaky_relu
2  0.594595                tanh
3  0.517896            logistic
moons training scores
   accuracy activation_function
0  0.533333                relu
1  0.533333          leaky_relu
2  0.475444                tanh
3  0.520000            logistic


  cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
  cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
  cross_ent_loss = -np.sum(np.multiply(y_i,np.log(s_i)))/batch_size #calculate the corss entropy loss
  if prev_cross_ent_loss - cross_ent_loss <converge_thresh : #check if system has learned enough
  self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before])
  [np.sum([np.multiply(node.delta_z[ind],node.w) for node in layer_after]) for ind in range(self.batch_size)])
  self.delta_z = np.multiply([self.f_prime(z) for z in self.z],
  for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)
  for ind in range(self.batch_size)] + np.multiply(self.regularisation/self.batch_size,self.w)
  self.delta_w = [np.multiply(self.delta_z[ind],[node.a[ind] for node in layer_before])
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

blobs accuracy for Neural net 1 hidden layer, 10 nodes and function tanh = 0.303
moons accuracy for Neural net with 1 hidden layer, 100 nodes and function relu = 0.9


The errors indicate more consideration needs to be taken for using different activation functions with the cross entropy loss function, typically a softmax layer comes before the cross entropy loss calculation, however as the prediction values are binary I chose to just include the probability as output by the activation functions and not worry about splitting into two seperate probabilities for the softmax layer.
Through tuning, the blobs validation set yielded best results when using a tanh function yet when applied to the test set performed horribly with 30% accuracy, this would be expected as a tan function assumes the data not linear, which blobs is (linearly seperable that is). Classic case of overfitting
For the moons dataset, relu, leaky relu and logistic all performed much the same on the validation dataset, and when using a relu function on the test dataset yielded the same accuracy score 90% as before.

Part 4: Challenging Task

In [128]:
#Load Cifar dataset :  entire chunk taken from LoadCIFAR10.ipynb supplied by Dr. Michael Madden

# This function taken from the CIFAR website

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

# Loaded in this way, each of the batch files contains a dictionary with the following elements:
#   data -- a 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image. 
#           The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue. 
#           The image is stored in row-major order, so that the first 32 entries of the array are the red channel values 
#           of the first row of the image.
#   labels -- a list of 10000 numbers in the range 0-9. 
#             The number at index i indicates the label of the ith image in the array data.

def loadbatch(batchname):
    folder = 'cifar-10-batches-py'
    batch = unpickle(folder+"/"+batchname)
    return batch

def loadlabelnames():
    folder = 'cifar-10-batches-py'
    meta = unpickle(folder+"/"+'batches.meta')
    return meta[b'label_names']

import matplotlib.pyplot as plt

def visualise(data, index):
    # MM Jan 2019: Given a CIFAR data nparray and the index of an image, display the image.
    # Note that the images will be quite fuzzy looking, because they are low res (32x32).

    picture = data[index]
    # Initially, the data is a 1D array of 3072 pixels; reshape it to a 3D array of 3x32x32 pixels
    # Note: after reshaping like this, you could select one colour channel or average them.
    picture.shape = (3,32,32) 
    
    # Plot.imshow requires the RGB to be the third dimension, not the first, so need to rearrange
    picture = picture.transpose([1, 2, 0])
    plt.imshow(picture)
    plt.show()

batch1 = loadbatch('data_batch_1')
print("Number of items in the batch is", len(batch1))

# Display all keys, so we can see the ones we want
print('All keys in the batch:', batch1.keys())


data = batch1[b'data']
labels = batch1[b'labels']
print ("size of data in this batch:", len(data), ", size of labels:", len(labels))
print (type(data))
print(data.shape)

names = loadlabelnames()




Number of items in the batch is 4
All keys in the batch: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
size of data in this batch: 10000 , size of labels: 10000
<class 'numpy.ndarray'>
(10000, 3072)


In [129]:
#index for RGB colours respectively
red = [i for i in range(1024)]
green = [i + 1024 for i in range(1024)]
blue = [i + 2048 for i in range(1024)]

#Segment data into only one colour and normalising features in range [0,1] by dividing by the max pixel value 255
normalised_red_data = [picture[red]/255 for picture in data]
normalised_green_data = [picture[green]/255 for picture in data]
normalised_blue_data = [picture[blue]/255 for picture in data]

data = normalised_red_data #selecting red channel to test on

classes = [b'dog',b'cat'] #Given classes to differentiate between

#subset total dataset into just the two classes as above and label 0 if dog and 1 if cat
labelled_subset = [[data[i],0 if names[labels[i]] == b'dog' else 1]  for i in range(len(data)) if names[labels[i]] in classes]

#create a dataframe from this subset
cifar = pd.DataFrame(labelled_subset,columns = ["data","Class"]) 


#split the data by percentage
cifar_train,cifar_validation,cifar_test = train_valid_test(cifar,0.7,0.15,0.15)


cifar_y_train = cifar_train["Class"].to_numpy()
cifar_X_train = cifar_train['data'].to_numpy()

cifar_y_validate = cifar_validation["Class"].to_numpy()
cifar_X_validate = cifar_validation["data"].to_numpy()

cifar_y_test = cifar_test["Class"].to_numpy()
cifar_X_test = cifar_test["data"].to_numpy()



epochs = 20
NN = Neural_net()
cifar_score = []
for i in range(2):
    rand.seed(seed+i)
    NN.fit(cifar_X_train,cifar_y_train,learning_rate = 0.1,max_iter = 5000 ,epochs = epochs,batch_size = 1,shape = [50])
    predicted = NN.predict(cifar_X_test)
  
    actual = cifar_y_test
    cifar_score.append(np.mean(actual == predicted))


print("Cifar accuracy with 1 hidden layer and 50 nodes = " + str(np.mean(cifar_score) ))


Cifar accuracy with 1 hidden layer and 50 nodes = 0.4794520547945205


48% accuracy for differentiating between cats and dogs. Due to the number of features I had to reduce the number of tests to average over in order to let the system complete in a manageable time frame. Taking this into consideration, I will not tune the parameters as my sysem could not handle that computation.

Part 5: Deep Learning Enhancements

The enhancements which my Neural net supports are as follows:
1. Any number of hidden layers and customizable nodes per layer, including output layer
2. Mini batch gradient descent
3. L2 Regularization
4. Supports 4 different acitvation functions (minor)

In [130]:
#Experimenting with common values for each hyperparameter
LR = 0.1
function = 'logistic'
batch = 32  #batch size must be small realtive to dataset size, total dataset size is 2000, therefore 32 seems appropriate
shape = [50,50] #including a second hidden layer
penalty = 0.1 #including a L2 regularisation penalty value
epochs = 20
NN = Neural_net()
cifar_score = []
for i in range(2):
    rand.seed(seed+i)
    NN.fit(cifar_X_train,cifar_y_train,learning_rate = LR ,max_iter = 5000 ,activation_fn = function,
                   epochs = epochs,batch_size = batch,shape = shape,output = 1,regularisation_penalty = penalty)
    predicted = NN.predict(cifar_X_test)
  
    actual = cifar_y_test
    cifar_score.append(np.mean(actual == predicted))


print("Cifar accuracy with 2 hidden layers,50 nodes each, batch size of 32, learning rate and regularisation penaly both equal to 0.1 = " + str(np.mean(cifar_score) ))


Cifar accuracy with 2 hidden layers,50 nodes each, batch size of 32, learning rate and regularisation penaly both equal to 0.1 = 0.5205479452054794


By increasing the batch size from 1 to 32, including an extra hidden layer of 50 nodes and including an L2 regularisation penalty term , the accuracy of the prediction increased by 4%. Perhaps with further tuning, and a deeper network this score would increase further. I contemplate this in the next chunk, but don't actually run it for computational complexity reasons.

In [None]:
#Run this cell only on a gpu , will take a very long time, 10's of hours I estimate
#It iterates through multiple common values of every hyperparameter  and saves the training results in a csv file
#Of course this would possibly lead to overfitting on the validation set, however by saving all the results,
#one could see the tradeoff in accuracy and complexity.
#I tried running this overnight on google collab however, collab timed out before completion theredfore I leave 
#only as an idea.

import csv 
NN = Neural_net()

activation_functions = ['relu','leaky_relu','tanh','logistic']
learning_rates = [0.15,0.1,0.05,0.01,0.001]
regularisation_penalties = [10,1,0.1,0.001]
batch_sizes = [16,32,64]

num_hidden_layers = [1,2,3,4]
num_nodes_per_layer = [10,25,50,75,100]

shapes = []
for hidden_layer in num_hidden_layers:
    for nodes in num_nodes_per_layer:
        shapes.append([nodes]*hidden_leys) #various shapes
        
tuning_score = []
for function in activation_functions:
    for LR in learning_rates:
        for penalty in regularisation_penalties:
            for batch in batch_sizes:
                for shape in shapes:
                    
                NN.fit(cifar_X_train,cifar_y_train,learning_rate = LR ,max_iter = 10000 ,activation_fn = function,
                   epochs = 20,batch_size = batch,shape = shape,output = 1,regularisation_penalty = penalty)
                prediction = NN.predict(cifar_X_validate)
                accuracy = np.mean(prediction,cifar_y_validate)
                tuning_Score.append([accuracy,function,LR,penalty,batch,shape])
                
with open('tuning_data.csv', 'w') as f: 
      
    # save data as csv
    write = csv.writer(f) 
      
    write.writerows(tuning_score) 
    
index_of_best_highest_accuarcy = np.argmax([moons_training_score[i][0] for i in range(len(tuning_Score))])   

#extract values to use for testing
score,function,Learning_rate,regularisation_penalty,batch_size,shape = tuning_Score[index_of_best_highest_accuarcy]

test_score = []
for i in range(20):
    NN.fit(cifar_X_train,cifar_y_train,learning_rate = Learning_rate ,max_iter = 10000 ,activation_fn = function,
                   epochs = 50,batch_size = batch_size,shape = shape,output = 1,regularisation_penalty = regularisation_penalty)
    predicted = my_logistic.predict(moons_X_test)
    test_score.append(np.mean(moons_y_test == predicted)) 

print('cifar accuracy after tuning = ' + str(np.mean(test_score)))
print("activation function = " +str(function) + "; Learning_rate = " 
      + str(learning_rate) + "; penalty = " +str(regularisation_penalty) + "; batch size = " 
      + str(batch_size) + "; shape = " + str(shape) )