# Luke Hayes - Deep Learning Assignment 1 - 14498098

In [487]:
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import pandas as pd
import numpy as np
import random
# Display plots inline and change default figure size
%matplotlib inline
from sklearn.model_selection import train_test_split


**Part 1 - Create Logistic Regression Algorithm** 

The get_data function below is used to take in csv file with the data and parse it into the format that we require. It does the splitting of the train, test and validation data and also splits up the attributes and labels.

In [492]:
def get_data(df):
    #HERE WE SPLIT THE TRAINING DATA 
    train, test = train_test_split(df, test_size=0.3)
    val, test = train_test_split(test, test_size=0.5)

    #ISOLATE JUST THE VALUES AND NOT THE COLUMN HEADERS
    train_set = train.values
    test_set = test.values
    val_set = val.values 

    #ISOLATE THE Y VALUES AND DELETE Y VALUES SO WE CAN ISOLATE ATTRIBUTES
    Ytrain = train['Class'].values
    Ytest = test['Class'].values
    Yval = val['Class'].values

    del train['Class']   
    del test['Class']   
    del val['Class']   

    #GET THE VALUES OF THE X VALUES
    Xtrain = train.values     
    Xtest = test.values     
    Xval = val.values 
    
    x, num_attr = Xval.shape

    return num_attr, Xtrain, Xtest, Xval, Ytrain, Ytest, Yval, train_set, test_set, val_set

The next number of functions are used to run logistic regression on the data.

In [493]:
def logistic_regressor(num_atr, max_iter, train, alpha, threshold):
    
    #INITIALISE THE WEIGHTS TO BE SMALL VALUES CLOSE TO 0
    w = np.random.normal(0,0.01,size=(num_atr))
    b = np.random.normal(0,0.01,size=(1))
    j_cur = 0
    
    delta_w = np.random.normal(0,0,size=(num_atr))

    #LOOP FOR MAX ITERATIONS
    for i in range(max_iter):
        
        #GET A RANDOME SAMPLE
        res = random.sample(list(train), 1)[0]
        #GET THE Y VALUE FOR THAT SAMPLE
        yval = res[-1]
        #GET THE X VALUES OF THE SAMPLE
        x = np.delete(res,-1,0)
        
        #GET Y HAT - THE PREDICTED VALUE 
        y_hat = get_yhat(w,x,b)
        
        #PREVIOUS VALUE IS EQUAL TO THE LAST ITERATION VALUE
        j_prev = j_cur
        #GET THE COST FUNCTION VALUE
        j_cur = -(yval*np.log(y_hat)+(1-yval)*np.log(1-y_hat))
        
        #IF THE CHANGE IN THE COST FUNCTION VALUE IS LESS THAN THE THRESHOLD FINISH THE FUNCTION
        if(i!=0 and abs(j_prev - j_cur) <= threshold):
            w = np.append(w,b)
            return w
        
        #GET DELTA W AND DELTA B
        for j in range(len(w)):
            delta_w[j] = (y_hat - yval)* x[j] 
        delta_b = y_hat - yval
        
        #UPDATE THE WEIGHTS AND BIAS USING DELTA W AND DELTA B AND THE LEARNING RATE
        for k in range(len(w)):
            w[k] -= alpha * delta_w[k]
            #print(w)
        b -= alpha * delta_b
        
    #RETURN THE WEIGHTS
    return w

def get_yhat(w,x,b):
    #GET THE DOT PRODUCT OF THE WEIGHTS AND THE X VALUES AND ADD THE BIAS
    val = np.dot(w,x)
    z = val + b
    #GET THE SIGMOID VALUE OF THE OUTPUT
    y = 1/(1+np.exp(-z))
    return y


def runLogisticRegression(num_attr, max_iter, train, test, learn_rate, thresh):

    x = logistic_regressor(num_attr, 10000000, train, 0.001, 0.000001)

    #GET THE TRAINING VALUES AND THE BIAS FROM THE INPUT DATA
    train_bias = x[-1]
    train_weights = np.delete(x,-1,0)

    correct = 0
    total = 0
    
    #LOOP THROUGH ALL TEST SAMPLES
    for t in test:
        
        #GET THE Y VALUE AND X VALUES 
        test_yval = t[-1]
        x = np.delete(t,-1,0)
        
        #MULTIPLY THE X VALUES BY THE TRAINING WEIGHTS 
        test_val = np.dot(train_weights,x)
        #ADD THE BIAS
        testz = test_val + train_bias
        #GET THE SIGMOID VALUE
        test_y_hat = 1/(1+np.exp(-testz))

        #print("This is the prediected value %f and this is the actual value %d" %(test_y_hat ,test_yval))
        if (test_y_hat >= 0.5):
            test_y_hat = 1
        else:
            test_y_hat = 0
        if (test_y_hat == test_yval):
            correct = correct + 1
            total = total + 1
        else:
            total = total + 1

    accuracy = (correct/total)
    return accuracy

In [490]:
max_iter = 10000000
learn_rate = 0.001
thresh = 0.000001

**Part 2 - Test Logistic Regression Algorithm**

Here we then run logistic regression on the linearly seperable and non-linearly seperable data.

In [494]:
df = pd.read_csv("C:/Users/lhaye/Downloads/blobs250.csv")

num_attr, Xtrain, Xtest, Xval, Ytrain, Ytest, Yval, train_set, test_set, val_set = get_data(df)
accuracy = runLogisticRegression(num_attr, max_iter, train_set, test_set, learn_rate, thresh)

print("Accuracy for the linearly seperable dataset: ", accuracy)
print("")
print("")


df = pd.read_csv("C:/Users/lhaye/Downloads/moons400.csv")

num_attr, Xtrain, Xtest, Xval, Ytrain, Ytest, Yval, train_set, test_set, val_set = get_data(df)
accuracy = runLogisticRegression(num_attr, max_iter, train_set, test_set, learn_rate, thresh)

print("Accuracy for the non-linearly seperable dataset: ", accuracy)

Accuracy for the linearly seperable dataset:  1.0


Accuracy for the non-linearly seperable dataset:  0.8333333333333334


As we can see the Logistic regressor performs with an accuracy of 100% on the test set of the linearly seperable data. The algorithm surprisingly achieves a very high accuracy of 83% on the non-linearly seperable dataset. This is an excellent result.

**Part 3 - Shallow Neural Network**

Next we have the implementation of the shallow neural network.

In [1033]:
"""
USE NUMPY VECTORIZATION AS FAR AS POSSIBLE TO SPEED UP BOTH TRAINING AND TESTING
"""
#SIGMOID FUNCTION
def sig(x):
    return 1 / (1 + np.exp(-x))

#SIGMOID DERIVATIVE FUNCTION 
def sig_deriv(x):
    return np.multiply(sig(x), 1 - sig(x))

#NEURAL NETWORK TRAINING
def training(num_attr, max_iter, attribute, label, learn_rate, thresh):
    
    #INITIALIZATION     
    hid_size = 600
    input_size = num_attr
    out_size = 1
    
    #INITIALISE WEIGHTS AND BIAS
    hid_w = np.random.uniform(-1,1,size=(input_size , hid_size))
    out_w = np.random.uniform(-1,1,size=(hid_size))
    b1 = np.random.uniform(-0.5,0.5,size=(hid_size))
    b2 = np.random.uniform(-0.5,0.5,size=(out_size))
    
    j_cur = 0
    epoch = 0
    cost_total = 0
    
    for i in range(max_iter):
        
        #GET INDEX OF RANDOM TRAINING SAMPLE
        select = random.randint(0, len(attribute) - 1)

        #GET THE TRAINING VALUES AND THE OUTPUT(Y) VALUE
        x = attribute[select]
        yval = label[select]

        ######################
        #FORWARD PROPOGATION
        ######################
    
    
        hid_node_z = np.dot(x,hid_w)
        hid_node_z = np.add(hid_node_z, b1)
        hid_node = sig(hid_node_z)
                        
        output = np.dot(hid_node, out_w) + b2
        
        #GET OUTPUT
        y_hat = sig(output)
                
        j_prev = j_cur
        
        #GET COST 
        j_cur = -(yval*np.log(y_hat)+(1-yval)*np.log(1-y_hat))
        
        #APPEND IT TO A TOTAL FOR THE EPOCH 
        cost_total += j_cur

        #IF WE HIT 1675 ITERATIONS WE HAVE DONE ONE EPOCH
        if(i%1675==0):

            epoch = epoch + 1
            print("Cost function for this epoch is ", (cost_total/1675))
            cost_total = 0

        #IF WE REACH MAX EPOCHS OR THE THRESHOLD WE RETURN
        if(i!=0 and abs(j_prev - j_cur) <= thresh or epoch==40):
            return hid_w, out_w, b1, b2, hid_size
        
        
        
        ##################
        #BACK PROPOGATION
        ##################
        delta_z_out = y_hat - yval
        #MIGHT NEED TO REMOVE THIS
        delta_w_out = delta_z_out * hid_node 
        delta_b_out = delta_z_out
        
        
        #NOW WE BACKPROPOGATE TO THE HIDDEN LAYER 
        #WE NEED DETLA Z OF THE HIDDEN LAYER FOR EACH NODE
        deriv_activ = [sig_deriv(x) for x in hid_node_z]        
        delta_z_hid = np.multiply(delta_z_out, out_w)    
        delta_z_hidden = np.multiply(delta_z_hid, deriv_activ)
        
        #GET DELTA W FOR EACH NODE OF THE HIDDEN LAYER
        delta_w_hidden = np.multiply(delta_z_hidden,hid_node)
        
        #DELTA B IS EQUAL TO DELTA Z 
        delta_b_hidden = delta_z_hidden
        
        #######################################################################################
        #MODIFY THE WEIGHTS AND THE BIAS VALUES USING THE CALCUALTED VALUES AND LEARNING RATE
        #######################################################################################
        val = np.multiply(learn_rate,delta_b_hidden)
        b1 = np.subtract(b1, val)
        
        val = np.multiply(learn_rate,delta_w_hidden)
        hid_w = np.subtract(hid_w, val)
        
        val = np.multiply(learn_rate,delta_w_out)
        out_w = np.subtract(out_w, val)
        
        val = np.multiply(learn_rate,delta_b_out)
        b2 = np.subtract(b2, val)
        
    return hid_w, out_w, b1, b2, hid_size
        

#THIS FUNCTION TAKES IN THE TRAINING WEIGHTS AND TRIED TO PREDICT THE VALUES OF THE TEST SAMPLES
#AN ACCURACY IS THEN RETURNED BASED ON HOW WELL THE NEURAL NETWORK PREDICTS THE VALUES
def predict(hidden_weights, output_weights, bias_1, bias_2, attributes, labels, hid_size):
    correct = 0
    total = 0
    
    #LOOP THROUGH ALL THE TEST CASES 
    for t in range(len(attributes)):        
        
        #FORWARD PROPAGATION
        #HIDDEN LAYER
        
        #print(attributes[t])
        #print(hidden_weights)
        hid_node_z = np.dot(attributes[t],hidden_weights) + b1
        hid_node = sig(hid_node_z)
            
        #OUTPUT LAYER
        output = np.dot(hid_node, output_weights) + b2 
        y_hat = sig(output)
        
        #IF THE YHAT VALUE IS ABOVE 0.5 SET IT TO 1 AND IF LESS SET TO 0
        if y_hat >= 0.5:
            y_hat = 1
        else:
            y_hat = 0
            
        #IF THE PREDICTION IS CORRECT INCREASE CORRECT 
        if (y_hat == labels[t]):
            correct = correct + 1
            total = total + 1
        else:
            total = total + 1
        
        #print("This is the prediected value %f and this is the actual value %d" %(y_hat ,labels[t]))
    
    #RETURN ACCURACY
    accuracy = (correct/total)
    return accuracy


**Part 4 - Test Shallow Neural Network**

Now we will run the shallow neural network on both datasets as was done with the logistic regression algorithm.

In [1035]:
max_iter = 100000
learn_rate = 0.001
thresh = 0.000001
    
#TEST ON LINEARLY SEPERABLE DATASET
df = pd.read_csv("C:/Users/lhaye/Downloads/blobs250.csv")
num_attr, Xtrain, Xtest, Xval, Ytrain, Ytest, Yval, train_set, test_set, val_set = get_data(df)
    
hid_w, out_w, b1, b2, hid_size = training(num_attr, max_iter, Xtrain, Ytrain, learn_rate, thresh)

accuracy = predict(hid_w, out_w, b1, b2, Xtest, Ytest, hid_size)
print("Accuracy for the linearly seperable dataset: ", accuracy)


print(" ")
print(" ")


#TEST ON NON-LINEARLY SEPERABLE DATASET
df = pd.read_csv("C:/Users/lhaye/Downloads/moons400.csv")
num_attr, Xtrain, Xtest, Xval, Ytrain, Ytest, Yval, train_set, test_set, val_set = get_data(df)
    
hid_w, out_w, b1, b2, hid_size = training(num_attr, max_iter, Xtrain, Ytrain, learn_rate, thresh)

accuracy = predict(hid_w, out_w, b1, b2, Xtest, Ytest, hid_size)
print("Accuracy for the non-linearly seperable dataset: ", accuracy)

Cost function for this epoch is  [0.00031455]
Accuracy for the linearly seperable dataset:  1.0
 
 
Cost function for this epoch is  [3.23859413e-06]
Cost function for this epoch is  [0.64465785]
Cost function for this epoch is  [0.40931941]
Cost function for this epoch is  [0.35391735]
Cost function for this epoch is  [0.30273805]
Cost function for this epoch is  [0.31922518]
Cost function for this epoch is  [0.30878181]
Cost function for this epoch is  [0.30512534]
Cost function for this epoch is  [0.2997023]
Cost function for this epoch is  [0.28623136]
Cost function for this epoch is  [0.28976325]
Cost function for this epoch is  [0.28038651]
Cost function for this epoch is  [0.26737223]
Cost function for this epoch is  [0.29590647]
Cost function for this epoch is  [0.29718394]
Cost function for this epoch is  [0.31688854]
Cost function for this epoch is  [0.29058712]
Cost function for this epoch is  [0.28815536]
Cost function for this epoch is  [0.29728738]
Cost function for this 

We can see that this shallow neural network performs with an accuracy of 100% on the test set of the linearly seperable data. There is also an accuracy of 90% achieved on the non-linear dataset. This is an increase of over 7% when compared to the logistic regression algorithm.

Now we move on to getting the Cifar data and manipulating it into a way that we can feed it into the neural network. First read in the batch, then convert each image in the batch to a 1024 array. Then remove the indexes that are not an automobile or horse. Next change the values of the horse and automobile class to 0 and 1. Finally split the data into training and test set. 

In [851]:
#NOW LETS MOVE ON TO THE IMAGE DATASET TASK 
import pickle
import io

#FISRT TWO FUNCTIONS TAKE FROM LECTURE CODE
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

def loadbatch(batchname):
    folder = 'C:/Users/lhaye/Downloads/cifar-10-batches-py'
    batch = unpickle(folder+"/"+batchname)
    return batch

def load_cifar_data():
    batch1 = loadbatch('data_batch_1')

    attr = batch1[b'data']
    labels = batch1[b'labels']
    
    #READ IN EACH IMAGE IN AS 1024 AS REQUIRED
    #THEREFORE THERE WILL BE 1024 INPUTS INTO THE NEURAL NETWORK
    num_attr = 1024
    #EACH ATTR IS AN IMAGE INPUT
    attr  = [image[0:num_attr] for image in attr]

    #HERE WE JUST TAKE THE INDEXES IF THE LABELS ARE 1 OR 7 AS THIS IS THE INDEX OF HORSE AND AUTOMOBILE
    indexes = [index for index in range(len(labels)) if (labels[index] == 1 or labels[index] == 7)]
    #UPDATE THE ATTRIBUTES TO JUST HAVE THE IMAGES WE WANT ONLY
    attr = [attr[index] for index in indexes]
    #UPDATE THE LABELS ALSO 
    labels = [labels[index] for index in indexes]

    #NOW LETS SET THE LABEL VALUE OF 7 TO 0 AS AUTOMOBILE IS ALREADY 1 SO HORSE WILL NOW BE 0 
    #THIS MAKES CLASSIFICATION EASIER LATER
    for key, val in enumerate(labels):
        if(val == 7):
            labels[key] = 0

    #SPLIT DATA INTO TRAIN AND TEST AND NORMALIZE
    train_attr = np.array(attr[300:]) / 255 
    train_labels = np.array(labels[300:]).T

    test_attr = np.array(attr[:300]) / 255 
    #print(len(train_attr))
    test_labels = np.array(labels[:300]).T
    
    val = train_attr[0]

    return num_attr, train_attr, test_attr, train_labels, test_labels


Then run this shallow neural network on the Cifar data that has just been prepared.

In [1036]:
max_iter = 16750000
learn_rate = 0.001
thresh = 0.000001

num_attr, train_attributes, test_attributes, train_labels, test_labels= load_cifar_data()

hid_w, out_w, b1, b2, hid_size = training(num_attr, max_iter, train_attributes, train_labels, learn_rate, thresh)

accuracy = predict(hid_w, out_w, b1, b2, test_attributes, test_labels, hid_size)
print("Accuracy for the Neural Network on the Cifar Dataset - Horse vs Automobile Classification: ", accuracy)

Cost function for this epoch is  [0.0077466]
Cost function for this epoch is  [1.1354225]
Cost function for this epoch is  [0.90812979]
Cost function for this epoch is  [0.78954652]
Cost function for this epoch is  [0.72989067]
Cost function for this epoch is  [0.64835109]
Cost function for this epoch is  [0.67282463]
Cost function for this epoch is  [0.6099486]
Cost function for this epoch is  [0.59159007]
Cost function for this epoch is  [0.55620834]
Cost function for this epoch is  [0.58020773]
Cost function for this epoch is  [0.57579076]
Cost function for this epoch is  [0.50394152]
Cost function for this epoch is  [0.4878809]
Cost function for this epoch is  [0.49285904]
Cost function for this epoch is  [0.45893646]
Cost function for this epoch is  [0.44502811]
Cost function for this epoch is  [0.45138609]
Cost function for this epoch is  [0.43990271]
Cost function for this epoch is  [0.45937504]
Accuracy for the Neural Network on the Cifar Dataset - Horse vs Automobile Classific

The accuracy value for the shallow neural network on the Cifar image test set is 75%. This result is quite impressive considering we have just used the image alone. 

**Part 5 - Neural Network Enhancement - Adaptable Number of Hidden Layers**
The enhancement I chose was a neural network with an adaptable number of hidden layers. This meant that any number could be passed into the algorithm to create N number of hidden layers. For the sake of simplicity each hidden layer has the same number of nodes and this is something that could be further developed in future work.

In [957]:
"""
USE NUMPY VECTORIZATION AS FAR AS POSSIBLE TO SPEED UP BOTH TRAINING AND TESTING
"""
#SIGMOID FUNCTION
def sig(x):
    return 1 / (1 + np.exp(-x))

#SIGMOID DERIVATIVE FUNCTION 
def sig_deriv(x):
    return np.multiply(sig(x), 1 - sig(x))


def training2(num_attr, max_iter, attribute, label, learn_rate, thresh, hid_layers):
    
    #INITIALIZATION     
    hid_size = 600
    input_size = num_attr
    out_size = 1
    
    
    #INITIALISE WEIGHTS AND BIAS
    out_w = np.random.uniform(-1,1,size=(hid_size))
    b_out = np.random.uniform(-0.5,0.5,size=(out_size))
    
    hidnew = {}
    b = {}
    j_cur = 0
    cost_total = 0
    epoch = 0
    
    
    #HERE WE CREATE THE WEIGHTS FOR THE SELECTED NUMBER OF HIDDEN LAYERS AND PUT THE WEIGHTS INTO A DICTIONARY
    for i in range(hid_layers):
        if (i == 0):
            hidnew["hlayer" + str(i)] = np.random.uniform(-1,1,size=(input_size , hid_size))
            b["hlayer" + str(i)] = np.random.uniform(-0.5,0.5,size=(hid_size))

        else:
            hidnew["hlayer" + str(i)] = np.random.uniform(-1,1,size=(hid_size , hid_size))
            b["hlayer" + str(i)] = np.random.uniform(-0.5,0.5,size=(hid_size))

            
    for i in range(max_iter):
                
        #GET INDEX OF RANDOM TRAINING SAMPLE
        select = random.randint(0, len(attribute) - 1)

        #GET THE TRAINING VALUES AND THE OUTPUT(Y) VALUE
        x = attribute[select]
        yval = label[select]
        
        
        ######################
        #FORWARD PROPOGATION
        ######################
        
        hid_node_z = {}
        hid_node = {}
        
        #FORWARD PROPAGATION FOR ALL THE HIDDEN LAYERS
        for j in range(hid_layers):
            if (j == 0):
                hid_node_z["hlayer" + str(j)] = np.dot(x,hidnew["hlayer" + str(j)])
                hid_node_z["hlayer" + str(j)] = np.add(hid_node_z["hlayer" + str(j)], b["hlayer" + str(j)])
                hid_node["hlayer" + str(j)] = sig(hid_node_z["hlayer" + str(j)])
                
            else:
                hid_node_z["hlayer" + str(j)] = np.dot(hid_node["hlayer" + str(j-1)],hidnew["hlayer" + str(j)])
                hid_node_z["hlayer" + str(j)] = np.add(hid_node_z["hlayer" + str(j)], b["hlayer" + str(j)])
                hid_node["hlayer" + str(j)] = sig(hid_node_z["hlayer" + str(j)]) 
                
            if (j == hid_layers-1):
                
                output = np.dot(hid_node["hlayer" + str(j)], out_w) + b_out

                #GET OUTPUT
                y_hat = sig(output)
                
                j_prev = j_cur
                j_cur = -(yval*np.log(y_hat)+(1-yval)*np.log(1-y_hat))
                
                cost_total += j_cur
                
                if(i%1675==0):
                    
                    epoch = epoch + 1
                    print("Cost function for this spoch is ", (cost_total/1675))
                    cost_total = 0
                
                #RETURN IF WE REACH THE NUMBER OF EPOCHS OR THE THRESHOLD
                if(i!=0 and abs(j_prev - j_cur) <= thresh or epoch == 40):
                    return hidnew, out_w, b, b_out, hid_size
                
                ##NOW WE NEED TO BACKPROPAGATE
                delta_z_out = y_hat - yval
                delta_w_out = delta_z_out * hid_node["hlayer" + str(j)]
                delta_b_out = delta_z_out
                
        deriv_activ = {}
        delta_z_hid = {}
        delta_z_hidden = {}
        delta_w_hidden = {}
        delta_b_hidden = {}
                
        for j in reversed(range(hid_layers)):
 
            #IF WE ARE AT THE LAST HIDDEN LAYER
            if (j == hid_layers-1):
        
                #NOW WE BACKPROPOGATE TO THE HIDDEN LAYER 
                deriv_activ["hlayer" + str(j)] = [sig_deriv(x) for x in hid_node_z["hlayer" + str(j)]]        
                delta_z_hid["hlayer" + str(j)] = np.multiply(delta_z_out, out_w)    
                delta_z_hidden["hlayer" + str(j)] = np.multiply(delta_z_hid["hlayer" + str(j)], deriv_activ["hlayer" + str(j)])

                #GET DELTA W FOR EACH NODE OF THE HIDDEN LAYER
                delta_w_hidden["hlayer" + str(j)] = np.multiply(delta_z_hidden["hlayer" + str(j)],hid_node["hlayer" + str(j)])

                #DELTA B IS EQUAL TO DELTA Z 
                #print("hlayer" + str(i))
                delta_b_hidden["hlayer" + str(j)] = delta_z_hidden["hlayer" + str(j)]
            
            #ALL OTHER HIDDEN LAYERS
            else:
                
                #NOW WE BACKPROPOGATE TO THE HIDDEN LAYER 
                deriv_activ["hlayer" + str(j)] = [sig_deriv(x) for x in hid_node_z["hlayer" + str(j)]]   
                delta_z_hid["hlayer" + str(j)] = np.dot(delta_z_hidden["hlayer" + str(j+1)], hidnew["hlayer" + str(j+1)])    
                delta_z_hidden["hlayer" + str(j)] = np.multiply(delta_z_hid["hlayer" + str(j)], deriv_activ["hlayer" + str(j)])
                
                #GET DELTA W FOR EACH NODE OF THE HIDDEN LAYER
                delta_w_hidden["hlayer" + str(j)] = np.multiply(delta_z_hidden["hlayer" + str(j)],hid_node["hlayer" + str(j)])
                
                #DELTA B IS EQUAL TO DELTA Z 
                delta_b_hidden["hlayer" + str(j)] = delta_z_hidden["hlayer" + str(j)]        
        
        
    
        for j in reversed(range(hid_layers)):
            
            val = np.multiply(learn_rate,delta_b_hidden["hlayer" + str(j)])
            b["hlayer" + str(j)] = np.subtract(b["hlayer" + str(j)], val)

            
            ###THE PROBLEM IS WITH DELTA W HIDDEN
            valnew = np.multiply(learn_rate,delta_w_hidden["hlayer" + str(j)])
            
            
            hidnew["hlayer" + str(j)] = np.subtract(hidnew["hlayer" + str(j)], valnew)
        
        
        val = np.multiply(learn_rate,delta_w_out)
        out_w = np.subtract(out_w, val)
        
        val = np.multiply(learn_rate,delta_b_out)
        b_out = np.subtract(b_out, val)
    

    
    return hidnew, out_w, b, b_out, hid_size

In [958]:
#MUST ODIFY TO WORK WITH THE NEW EXRTA LAYER
def predict2(hidden_weights, output_weights, bs_1, bs_2, attributes, labels, hid_layers):
    correct = 0
    total = 0    
    
    #LOOP THROUGH ALL THE TEST CASES 
    for t in range(len(attributes)):        
        
        hid_node_z_new = {}
        hid_node_new = {}
        
        for i in range(hid_layers):
            
            #FORWARD PROPAGATION
            #HIDDEN LAYER
            if (i == 0):
                nval = np.dot(attributes[t],hidden_weights["hlayer" + str(i)])
                hid_node_z_new["hlayer" + str(i)] = np.add(nval, bs_1["hlayer" + str(i)])
                hid_node_new["hlayer" + str(i)] = sig(hid_node_z_new["hlayer" + str(i)]) 

            else:
                nval = np.dot(hid_node_new["hlayer" + str(i-1)],hidden_weights["hlayer" + str(i)])
                hid_node_z_new["hlayer" + str(i)] = np.add(nval, bs_1["hlayer" + str(i)])
                hid_node_new["hlayer" + str(i)] = sig(hid_node_z_new["hlayer" + str(i)])     
                
        hid_node_new_val = hid_node_new["hlayer" + str(i)]
            
        #OUTPUT LAYER
        output = np.dot(hid_node_new_val, output_weights) + bs_2 
        y_hat = sig(output)

        
        #IF THE YHAT VALUE IS ABOVE 0.5 SET IT TO 1 AND IF LESS SET TO 0
        if y_hat >= 0.5:
            y_hat = 1
        else:
            y_hat = 0
            
            
        #IF THE PREDICTION IS CORRECT INCREASE CORRECT 
        if (y_hat == labels[t]):
            correct = correct + 1
            total = total + 1
        else:
            total = total + 1
        
    
    #RETURN ACCURACY
    accuracy = (correct/total)
    return accuracy

In [961]:
max_iter = 16750000
learn_rate = 0.001
thresh = 0.000001
hid_layers = 2

num_attr, train_attributes, test_attributes, train_labels, test_labels= load_cifar_data()

hidnew, out_w, b, b_out, hid_size = training2(num_attr, max_iter, train_attributes, train_labels, learn_rate, thresh, hid_layers)
print("training done")
accuracy = predict2(hidnew, out_w, b, b_out, test_attributes, test_labels, hid_layers)

print(accuracy)

Cost function for this spoch is  [2.01766451e-05]
Cost function for this spoch is  [1.31667832]
Cost function for this spoch is  [1.00090121]
Cost function for this spoch is  [0.96105789]
Cost function for this spoch is  [0.91457859]
Cost function for this spoch is  [0.87195607]
Cost function for this spoch is  [0.81610489]
Cost function for this spoch is  [0.78939338]
Cost function for this spoch is  [0.7793564]
Cost function for this spoch is  [0.70327609]
Cost function for this spoch is  [0.70587692]
Cost function for this spoch is  [0.64111296]
Cost function for this spoch is  [0.64840017]
Cost function for this spoch is  [0.61516547]
Cost function for this spoch is  [0.64007675]
Cost function for this spoch is  [0.58890889]
Cost function for this spoch is  [0.56825849]
Cost function for this spoch is  [0.56727532]
Cost function for this spoch is  [0.55874823]
Cost function for this spoch is  [0.5574643]
Cost function for this spoch is  [0.53626105]
Cost function for this spoch is 

The results shown here are the results of running the algorithm with two hidden layers. This result slightly better in accuracy than the algorithm with just one hidden layer. It achieves an accuracy of 3% better. This improvement is not seen once more layers are added however. The more hidden layers that are added the further the performance of the algorithm degrades. I believe this may be due to the vanishing gradient problem that becomes more of a problem as more layers are added. This issue means that as the derivates are multiplied by each other when backproagating meaning the gradient will be smaller. This means that in the layers at the beginning of the network will experience very little change when training. This is a common issue when using the sigmoid activation function and therefore a better implementation would be to use the tanh function or another such activation function. This is subsequent work that could be carried out. We can see this decrease in performance in the use of 3 layers below as the performance decreases to 65% which is about a 10% decrease in accuracy.

In [1037]:
max_iter = 16750000
learn_rate = 0.001
thresh = 0.000001
hid_layers = 3

num_attr, train_attributes, test_attributes, train_labels, test_labels= load_cifar_data()

hidnew, out_w, b, b_out, hid_size = training2(num_attr, max_iter, train_attributes, train_labels, learn_rate, thresh, hid_layers)
print("training done")
accuracy = predict2(hidnew, out_w, b, b_out, test_attributes, test_labels, hid_layers)

print(accuracy)

Cost function for this spoch is  [5.165084e-05]
Cost function for this spoch is  [1.40804942]
Cost function for this spoch is  [1.06017571]
Cost function for this spoch is  [0.92657833]
Cost function for this spoch is  [0.86878052]
Cost function for this spoch is  [0.81086256]
Cost function for this spoch is  [0.79908394]
training done
0.65
