In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pixiedust

Pixiedust database opened successfully


# Algorithms

# <center>Neural Network Class</center>

In [2]:
class Layer:
    """
        One fully connected layer of the network.

        The bias is folded into the weight matrix: W has one more column than
        the layer has inputs, and feedFoward puts a row of ones on top of the
        incoming activations so that the first column of W acts as the bias.
    """

    def __init__(self, inputs, neurons, activation, derivative):
        """
            creates a layer in the Neural Network

            Inputs:
                inputs     : the number of nodes in previous layer
                neurons    : the number of neurons in current layer
                activation : callable applied to Z in feedFoward
                derivative : callable applied to the stored layer input in backprop
        """
        # (neurons x inputs+1) matrix drawn from a standard normal;
        # the extra column is the bias weight of each neuron
        self.W = np.random.randn(neurons, inputs + 1)

        self.act = activation
        self.div = derivative

    def feedFoward(self, A_prev):
        """
            computes the output of current layer

            Inputs:
                A_prev : outputs from the previous layer, a 2-D array whose
                         columns are stacked under a row of ones

            Outputs:
                A      : output from current Layer (activation of W @ [1; A_prev])

            Side effects:
                stores A_prev, Z and A on the layer for later use in backprop
        """
        _, columns = A_prev.shape
        self.A_prev = A_prev

        # prepend the constant bias input to every column
        bias_row = np.ones((1, columns))
        augmented = np.vstack((bias_row, A_prev))

        self.Z = self.W @ augmented
        self.A = self.act(self.Z)
        return self.A

    def backprop(self, dA, alpha):
        """
            propagates the error term of this layer back to its input

            Inputs:
                dA    : error term at the output of this layer
                alpha : learning rate (accepted for interface symmetry; not used here)

            Outputs:
                dA_prev : (W without its bias column).T @ dA, multiplied
                          element-wise by derivative(A_prev)
        """
        slope = self.div(self.A_prev)

        # the bias column carries no error back to the previous layer
        weights_no_bias = self.W[:, 1:].T
        return (weights_no_bias @ dA) * slope

    def set_weights(self, W):
        """replaces the weight matrix (bias column included) with W"""
        self.W = W
        




class Neural_Network:
    """
        A feed-forward network built from a chain of Layer objects and
        trained with full-batch gradient descent.
    """

    def __init__(self, param, func, deriv):
        """
            Initialises NN

            Inputs:
                param : the dimensions of the NN; consecutive entries give the
                        input and output size of each layer
                func  : the activation function that will be used
                deriv : derivative of func
        """
        # number of entries in param (input layer included)
        self.s = len(param)

        self.func = func
        self.deriv = deriv

        # filled below / during predict and train
        self.Layers = []
        self.delta = []
        self.A_list = []

        # one Layer per consecutive pair of sizes
        for fan_in, fan_out in zip(param[:-1], param[1:]):
            self.Layers.append(Layer(fan_in, fan_out, self.func, self.deriv))

    def predict(self, X):
        """
            Makes prediction off the dataset given

            X is transposed before being fed to the first layer. Every
            intermediate activation (starting with X.T) is kept in A_list.
            Returns the output of the last layer.
        """
        self.A_list = [X.T]

        for layer in self.Layers:
            self.A_list.append(layer.feedFoward(self.A_list[-1]))

        return self.A_list[-1]

    def train(self, X, y, alpha, epoch):
        """
            Runs `epoch` full-batch gradient-descent updates.

            Inputs:
                X     : data passed to predict
                y     : targets, subtracted from the network output
                        (must broadcast against it)
                alpha : the learning rate
                epoch : the number of update passes
        """
        for _ in range(epoch):
            error = self.predict(X) - y
            self.delta = [error]

            # walk from the last layer down to the second one; each call
            # yields the error term of the layer below it
            for layer in reversed(self.Layers[1:]):
                error = layer.backprop(error, alpha)
                self.delta.insert(0, error)

            gradients = self.Deltas()

            #TODO: Regularise here
            for layer, gradient in zip(self.Layers, gradients):
                layer.W = layer.W - (alpha * 1 / len(y)) * gradient

    def Deltas(self):
        """
            Builds one update matrix per stored error term: the error term
            times the (bias-augmented) input of the matching layer.
        """
        gradients = []

        for position, error in enumerate(self.delta):
            layer_input = self.A_list[position]
            _, columns = layer_input.shape

            # same bias row that Layer.feedFoward prepends
            augmented = np.vstack([np.ones((1, columns)), layer_input])
            gradients.append(error @ augmented.T)

        return gradients

    def setWeights(self, W, index):
        """overwrites the weight matrix of the layer at position `index`"""
        target = self.Layers[index]
        target.set_weights(W)
          

# <center>Logistic Model</center>

# Logistic Regression

Using the definition of the dot product, the logistic regression update rule can be written in vectorised form as:
$$\theta = \theta - \frac{\alpha}{m}(h(\theta X^{T}) - y) \cdot X^{T} + \lambda \theta$$

where:

$\alpha$ : the learning rate

$m$ : the number of training examples (rows of $X$)

$\theta$ : the initial learning parameters

$X$ : the data given in the design matrix format

$\lambda$ : the regularisation term

$h(\theta X^{T})$ : the sigmoid hypothesis, $h(z) = \frac{1}{1 + e^{-z}}$, applied element-wise to $\theta X^{T}$

In [3]:
def LogisticRegression(X,y,theta0,alpha,tol,L,num_iter):
    """
        fits theta by repeating a sigmoid-based update step until the change
        in theta is no larger than tol or the iteration budget is used up

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            theta0   : the intial guess on the learning parameters
            alpha    : the learning rate
            tol      : stop once the norm of the change in theta is <= tol
            L        : the regularisation parameter(lambda)
            num_iter : the number of extra update steps allowed after the first

        outputs:
            theta    : the learning parameters after the last update
    """

    def sigmoid(z):
        return 1/(1+np.exp(-z))

    def update(theta):
        # copy of theta with the first (intercept) entry zeroed, so the
        # lambda term leaves the intercept untouched
        # NOTE(review): the lambda term is added, not subtracted - confirm intended sign
        penalised = np.copy(theta)
        penalised[0] = 0
        return theta - alpha*(sigmoid(theta @ X.T) - y) @ X + L*penalised

    # one step is always taken before the stopping test
    steps_taken = 1
    theta_new = update(theta0)

    while np.linalg.norm(theta_new-theta0) > tol and steps_taken <= num_iter:
        steps_taken += 1
        theta0, theta_new = theta_new, update(theta_new)

    return theta_new

In [4]:
def y_subset(y,value):
    """
        builds the binary (one-vs-rest) labels for a single class

        inputs:
            y        : the true labels associated with the data
            value    : the label treated as the positive class

        outputs:
            y_subset : list with 1 where y equals value and 0 elsewhere
    """
    return [1 if y[position] == value else 0 for position in range(len(y))]

In [5]:
def One_vs_all(X,y,Theta0,alpha,tol,L,num_iter):
    """
        trains one binary logistic regression per distinct label in y

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : matrix of initial guesses; row i is the starting theta
                       for the i-th label in sorted (np.unique) order
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop

        outputs:
            theta    : array with one fitted parameter row per label
    """
    Param = [
        LogisticRegression(X, y_subset(y, label), Theta0[row, :], alpha, tol, L, num_iter)
        for row, label in enumerate(np.unique(y))
    ]
    return np.array(Param)

In [6]:
class LogisticModel:
    """
        Logistic regression model supporting a binary classifier or a
        one-vs-all multiclass classifier. Fitted parameters are stored in W.
    """

    def __init__(self,multiclass):

        """
            intialises the Logistic model class

            Inputs:
                multiclass: truthy to train one-vs-all, falsy for a single
                            binary classifier
        """

        # fitted parameters; None until train() has been called
        self.W = None
        self.multiclass = multiclass

    def train(self,X,y,Theta0,alpha,tol,L,num_iter):

        """
        fits the model and stores the result in self.W

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : the intial guess on the learning parameters
                       (a matrix with one row per class when multiclass,
                       otherwise a single parameter vector)
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop
        """

        if self.multiclass:
            self.W = self.One_vs_all(X,y,Theta0,alpha,tol,L,num_iter)
        else:
            # the binary branch previously referenced the undefined names
            # `Self` and `theta0` and therefore could never run
            self.W = self.LogisticRegression(X,y,Theta0,alpha,tol,L,num_iter)


    def LogisticRegression(self,X,y,theta0,alpha,tol,L,num_iter):
        """
            fits theta by repeating a sigmoid-based update step until the
            change in theta is no larger than tol or the iteration budget
            is used up

            inputs:
                X        : data in the form of the design matrix
                y        : the labels associated with the data
                theta0   : the intial guess on the learning parameters
                alpha    : the learning rate
                tol      : stop once the norm of the change in theta is <= tol
                L        : the regularisation parameter(lambda)
                num_iter : the number of extra update steps allowed after the first

            outputs:
                theta    : the learning parameters after the last update
        """

        sigmoid = lambda z: 1/(1+np.exp(-z))

        def update(theta):
            # the first (intercept) entry is excluded from the lambda term
            # NOTE(review): the lambda term is added, not subtracted - confirm intended sign
            V = np.copy(theta)
            V[0] = 0
            return theta - alpha*(sigmoid(theta @ X.T) - y) @ X + L*V

        # one step is always taken before the stopping test
        i = 1
        theta_new = update(theta0)

        while np.linalg.norm(theta_new-theta0) >tol and i <=num_iter:
            i+=1
            theta0 = theta_new
            theta_new = update(theta0)

        return theta_new


    def One_vs_all(self,X,y,Theta0,alpha,tol,L,num_iter):

        """
        trains one binary logistic regression per distinct label in y

        inputs:
            X        : data in the form of the design matrix
            y        : the labels associated with the data
            Theta0   : matrix of initial guesses; row i is the starting theta
                       for the i-th label in sorted (np.unique) order
            alpha    : the learning rate
            tol      : the margin of error
            L        : the regularisation parameter(lambda)
            num_iter : the number of times algorithms must loop

        outputs:
            theta    : array with one fitted parameter row per label
        """

        outcomes = np.unique(y)
        Param = []

        for i, value in enumerate(outcomes):
            y_sub = self.y_subset(y,value)
            temp_theta = Theta0[i,:]
            Param.append(self.LogisticRegression(X,y_sub,temp_theta,alpha,tol,L,num_iter))

        return np.array(Param)



    def y_subset(self,y,value):

        """
            builds the binary (one-vs-rest) labels for a single class

            inputs:
                y        : the true labels associated with the data
                value    : the label treated as the positive class

            outputs:
                y_subset : list with 1 where y equals value and 0 elsewhere
        """

        return [1 if y[i] == value else 0 for i in range(len(y))]

### <center>Confusion Matrix</center>

In [7]:
def confusion_matrix(output,y):
    """
        builds the confusion matrix of predictions against true labels

        inputs:
            output : the predicted labels, indexable by position 0..len(y)-1
            y      : the true labels

        outputs:
            accuracy   : fraction of positions where prediction and truth agree
                         (sum of the matrix diagonal divided by len(y))
            con_matrix : DataFrame of counts; rows are predicted labels,
                         columns are true labels
    """

    outcomes = list(np.unique(y))

    # a predicted label that never occurs in y used to make the index lookup
    # below fail; give such labels their own row/column instead
    unseen = [label for label in np.unique(output) if label not in outcomes]
    if unseen:
        outcomes = sorted(outcomes + unseen)

    matrix = np.zeros((len(outcomes),len(outcomes)))

    for i in range(len(y)):
        row = outcomes.index(output[i])
        col = outcomes.index(y[i])
        matrix[row,col] += 1

    # correct predictions sit on the diagonal
    accuracy = sum(matrix[k][k] for k in range(len(outcomes))) / len(y)

    con_matrix = pd.DataFrame(data = matrix,index= outcomes,columns=outcomes)
    return accuracy,con_matrix

# Cleaning the data

In [11]:
# Read the raw car data from CSV into a DataFrame (path is relative to the notebook)
Dataset = pd.read_csv("Data/carData.csv")

# Show the first 10 rows
Dataset.head(10)

Unnamed: 0,Buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [12]:
# Show the last 10 rows of the raw data
Dataset.tail(10)

Unnamed: 0,Buying,maint,doors,persons,lug_boot,safety,class
1718,low,low,5more,4,big,high,vgood
1719,low,low,5more,more,small,low,unacc
1720,low,low,5more,more,small,med,acc
1721,low,low,5more,more,small,high,good
1722,low,low,5more,more,med,low,unacc
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good
1727,low,low,5more,more,big,high,vgood


In [13]:
# Column names, non-null counts and dtypes of the raw data.
# info() prints its report itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None" line.
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
Buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB
None


In [14]:
"""
    cleaning the dataset. putting data into correct format

    Summary of the data

    | class values

    unacc, acc, good, vgood

    | attributes

    buying:   vhigh, high, med, low.
    maint:    vhigh, high, med, low.
    doors:    2, 3, 4, 5more.
    persons:  2, 4, more.
    lug_boot: small, med, big.
    safety:   low, med, high.



    class values replacement

        unacc = 0
        acc = 1
        good = 2
        vgood = 3


    buying values replacement:

        low = 0
        med = 1
        high = 2
        vhigh = 3


    doors replacement:

        5more = 5

    lug_boot replacement:

        small = 0
        med = 1
        big =2

    safety replacement:

        low = 0
        med = 1
        high =2
"""

# Integer code for every categorical string in the raw data.
# The mapping is applied to whole cell values across all columns, so a string
# shared by several columns (e.g. "med", "high") gets the same code everywhere.
value_codes = {
    # class
    "unacc": 0, "acc": 1, "good": 2, "vgood": 3,
    # buying / maint / safety levels ("med" also occurs in lug_boot)
    "low": 0, "med": 1, "high": 2, "vhigh": 3,
    # doors
    "5more": 5,
    # persons
    "more": 5,
    # lug_boot
    "small": 0, "big": 2,
}

# replace() returns a new frame, leaving Dataset untouched. The explicit
# to_numeric pass turns the remaining digit strings ("2", "3", "4") into
# integers and makes every column numeric without depending on replace()
# inferring the dtype; it raises if any unmapped string is left behind.
formatedData = Dataset.replace(value_codes).apply(pd.to_numeric)

#summary of data#
formatedData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
Buying      1728 non-null int64
maint       1728 non-null int64
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null int64
safety      1728 non-null int64
class       1728 non-null int64
dtypes: int64(7)
memory usage: 94.6 KB


In [15]:
# Inspect the last 20 rows of the encoded data
formatedData.tail(20)

Unnamed: 0,Buying,maint,doors,persons,lug_boot,safety,class
1708,0,0,5,2,2,1,0
1709,0,0,5,2,2,2,0
1710,0,0,5,4,0,0,0
1711,0,0,5,4,0,1,1
1712,0,0,5,4,0,2,2
1713,0,0,5,4,1,0,0
1714,0,0,5,4,1,1,2
1715,0,0,5,4,1,2,3
1716,0,0,5,4,2,0,0
1717,0,0,5,4,2,1,2


In [16]:
# Inspect the last 20 rows of the raw data
Dataset.tail(20)

Unnamed: 0,Buying,maint,doors,persons,lug_boot,safety,class
1708,low,low,5more,2,big,med,unacc
1709,low,low,5more,2,big,high,unacc
1710,low,low,5more,4,small,low,unacc
1711,low,low,5more,4,small,med,acc
1712,low,low,5more,4,small,high,good
1713,low,low,5more,4,med,low,unacc
1714,low,low,5more,4,med,med,good
1715,low,low,5more,4,med,high,vgood
1716,low,low,5more,4,big,low,unacc
1717,low,low,5more,4,big,med,good


In [20]:
# Export the encoded data to CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# confirm that downstream readers expect it.
formatedData.to_csv("Data/formated.csv")

In [18]:
formatedData.corr()

Unnamed: 0,Buying,maint,doors,persons,lug_boot,safety,class
Buying,1.0,0.0,0.0,0.0,0.0,0.0,-0.28275
maint,0.0,1.0,0.0,0.0,0.0,0.0,-0.232422
doors,0.0,0.0,1.0,0.0,0.0,0.0,0.066057
persons,0.0,0.0,0.0,1.0,2.5236509999999998e-19,0.0,0.373459
lug_boot,0.0,0.0,0.0,2.5236509999999998e-19,1.0,0.0,0.157932
safety,0.0,0.0,0.0,0.0,0.0,1.0,0.439337
class,-0.28275,-0.232422,0.066057,0.3734592,0.1579317,0.439337,1.0
