In [370]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
import math


In [419]:
def toBinary(dataframe):
    return 1.0 if dataframe > 3.0 else 0.0

In [420]:
b_cancer_df = pd.read_csv('breast-cancer-wisconsin.data', header=None)

b_cancer_df = b_cancer_df.drop(0, axis=1)

b_cancer_df.dropna(inplace=True) # Drops NA values
b_cancer_df.reset_index(drop=True,inplace=True) # Resets indices

b_cancer_df = b_cancer_df.drop(b_cancer_df[b_cancer_df[6] == '?'].index) 

b_cancer_df[10] = list(map(toBinary, b_cancer_df[10]))
#print(b_cancer_df)
#train test split
train = b_cancer_df.sample(frac=0.7, random_state=1)
test_set = b_cancer_df.drop(train.index)

#split test set to validation and test
test = test_set.sample(frac=0.6, random_state=1)
val = test_set.drop(test.index)






In [421]:
def array2d(y_val):
    result = list()
    for element in y_val:
        output = list()
        #element[0] = 0.0 + element[0]
        output.append(element)        
        result.append(output)
    return result
        

In [422]:
X_train = np.array(train.drop(10, 1).values,dtype=np.float32)
y_train = np.array(array2d(train[10].values))



In [423]:
X_test = np.array(test.drop(10, 1).values,dtype=np.float32)
y_test = np.array(array2d(test[10].values))


In [424]:
X_val = np.array(val.drop(10, 1).values, dtype=np.float32)
y_val = np.array(array2d(val[10].values))
print(type(y_val[0]))
print(y_val.shape)

<class 'numpy.ndarray'>
(82, 1)


In [425]:
def featureDesign(features):
    features[5] = features[5]/features[6] # A ratio of free sulphur to total sulphur
    features[5] = round(features[5], 4) # Rounds the ration into 4 decimal points. Potential ERROR!!    
    features.pop(6) # TODO: Consider putting it in the main fuction
    features.pop(10)
    return features
    

In [426]:
def binaryOutput(features):
    if(features[11] > 5):
        features[11] = 1.0 # One implies positive
    else:
        features[11] = 0.0
    return features

In [427]:
with open('winequality-red.csv') as wineFile:
    content = wineFile.readline()
    wine_feature_matrix = list()
    wine_output_matrix = list()
    # Reads the file line by line
    for line in wineFile:
        output = list()
        line = ''.join(line) # Creates a string
        OrigWineMatrix =line.split(';')
        
        # Removes all the white spaces 
        OrigWineMatrix = [x.strip() for x in OrigWineMatrix]
        
        # Converts all the strings into float
        OrigWineMatrix = list(map(float, OrigWineMatrix)) 
        
        # Classifies all wine qualities into positive if > 5 otherwise negative
        OrigWineMatrix = binaryOutput(OrigWineMatrix) 
        
        output.append(OrigWineMatrix[11])
        # Create a list of a list of each output i.e [1], [0], [1]
        
        wine_output_matrix.append(output) 
        
        wine_feature_matrix.append(featureDesign(OrigWineMatrix))
    
    
    wine_output_matrix = np.array(wine_output_matrix) # Create an array of outputs
    wine_feature_matrix = np.array(wine_feature_matrix) # Create a array of features using numpy for matrix calculations
    
    wineFile.close()
    
    print(wine_output_matrix)
    print(wine_output_matrix.shape)
    print(wine_feature_matrix)
    print(wine_feature_matrix.shape)
    
    print(wine_output_matrix[0][0])

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [1.]]
(1599, 1)
[[ 7.4    0.7    0.    ...  3.51   0.56   9.4  ]
 [ 7.8    0.88   0.    ...  3.2    0.68   9.8  ]
 [ 7.8    0.76   0.04  ...  3.26   0.65   9.8  ]
 ...
 [ 6.3    0.51   0.13  ...  3.42   0.75  11.   ]
 [ 5.9    0.645  0.12  ...  3.57   0.71  10.2  ]
 [ 6.     0.31   0.47  ...  3.39   0.66  11.   ]]
(1599, 10)
0.0


In [428]:
#alpha = 0.0005 # Stepsize/learning rate
#iterations = 750 # Number of times we go through the gradient descent logic

class LogisticRegression:
    weights = list()
    def gradient_descent(self, feature_matrix, output_matrix, learningRate, iterations):
        #Initialize weight_matrix to 1s
        weights_matrix = np.ones((np.shape(feature_matrix)[1], 1))
        print(weights_matrix.shape)
        for i in range(iterations):
            
            weights_matrix = weights_matrix + (learningRate * np.dot(feature_matrix.transpose(), (output_matrix - self.sigmoid(np.dot(feature_matrix, weights_matrix)))))
        self.weights.append(weights_matrix)    
        return weights_matrix 
    
    def predict(self, inputX):
        probs = self.sigmoid(sum(np.dot(inputX, self.weights[0])))
        #result = "Probability = " + probs.__str__() + " | classified as negative"
        # "Probability = " + probs.__str__() + " | classified as positive"
        #print("Probs = ", probs)
        if(probs > 0.5): return 1.0
            
        return 0.0 
    def sigmoid(self, a): return 1.0 / (1 + np.exp(-a)) # P(y=1|x) formula
    
    #outputs the accuracy score
    def evaluate_acc(self, featuresX, target_labels):
        false_positive = 0
        false_negative = 0
        true_positive = 0
        true_negative = 0
        for i in range(len(target_labels)):
            predicted_val = self.predict(featuresX[i])
            #
            if(math.isclose(predicted_val, target_labels[i])):
                if(predicted_val > 0):
                   true_positive += 1
                else:
                   true_negative += 1
            else:
                if(predicted_val > 0):
                   false_positive += 1
                else:
                   false_negative += 1
        print("false postive ", false_positive,"false negative ", false_negative,"true postive ", true_positive, "true negative ", true_negative)
        
        return round((true_positive + true_negative)/ (true_positive + false_positive + false_negative + true_negative) * 100, 2)

In [430]:
logisticRegression = LogisticRegression()

weights_result = logisticRegression.gradient_descent(np.mat(X_train), np.mat(y_train), 0.005, 5000)
print(weights_result)
print(logisticRegression.weights)
#print(len(y_val))
pred = logisticRegression.predict(X_test[3])
#print(" Prediction =", pred)
#print(" Actual =", y_test[3])

eval = logisticRegression.evaluate_acc(X_val, y_val)
print(eval)

(9, 1)
[[-3.40590206]
 [ 9.10290435]
 [ 1.93954696]
 [ 1.53089634]
 [-6.48445976]
 [ 4.86590042]
 [-4.24348761]
 [ 2.39740147]
 [-2.21127144]]
[matrix([[-3.40590206],
        [ 9.10290435],
        [ 1.93954696],
        [ 1.53089634],
        [-6.48445976],
        [ 4.86590042],
        [-4.24348761],
        [ 2.39740147],
        [-2.21127144]]), matrix([[-3.40590206],
        [ 9.10290435],
        [ 1.93954696],
        [ 1.53089634],
        [-6.48445976],
        [ 4.86590042],
        [-4.24348761],
        [ 2.39740147],
        [-2.21127144]])]
false postive  6 false negative  3 true postive  27 true negative  46
89.02


In [444]:
class LDA:
    logOdds_function = list()
    def fit_function(self, feature_matrix, output_matrix):
        meansX0 = list()
        meansX1 = list()
        logOddsResult = self.logOdds(output_matrix)
        logOdds = logOddsResult[2]
        print(logOdds)
        means_result = self.mean_features(feature_matrix, output_matrix, logOddsResult[0], logOddsResult[1])
        covariance = self.covariance_calc(feature_matrix, output_matrix, means_result)
        meansX0.append(means_result[0])
        meansX1.append(means_result[1])
        
        function_params = list()
        
        #frst = 0.5 * means_result[1] * np.linalg.inv(covariance) * means_result[1].transpose()
        first = 0.5 * np.dot((np.dot(meansX1,np.linalg.inv(covariance))), np.array(meansX1).transpose())  
        second = 0.5 * np.dot((np.dot(meansX0,np.linalg.inv(covariance))), np.array(meansX0).transpose())
        #econd = 0.5 * means_result[0] * np.linalg.inv(covariance) * means_result[0].transpose()
        #third = feature_matrix.transpose() * np.linalg.inv(covariance) * (means_result[1] - means_result[0])
        
        logOddsRatio_part = logOdds - first + second
        
        function_params.append(logOddsRatio_part) # first part of logodds ratio function - index 0

        function_params.append(np.linalg.inv(covariance)) #  index 1
        function_params.append(np.array(meansX1) - np.array(meansX0)) #index 2
        
        self.logOdds_function.append(function_params) # Accessible from anywhere in the class
        return function_params
    def predict(self, inputX):
        function = self.logOdds_function[0]
        probs = self.sigmoid(inputX)
        
        if (probs > 0):
            return 1.0
        
        return 0.0
    
    def sigmoid(self, inputX):
        function = self.logOdds_function[0]
        print(function[0].shape)
        z = list()
        first = np.dot(inputX.transpose(), np.array(function[1]))
        z.append(first)
        
        print(np.array(z).shape)
        x = function[0] + np.dot(np.array(z), np.array(function[2]).transpose())
        return function[0] + np.dot(np.array(z), np.array(function[2]).transpose())

                              
    def logOdds(self, output_matrix):
        positive = 0
        negative = 0
        result = list()
        
        # Counts the number of positives to the number of negatives
        for element in output_matrix:
            if(element[0] > 0): 
                positive += 1
            else: 
                negative += 1
        
        # evaluates probability of the different class to evaluate the log Odds ratio
        probs_positive = (positive + 1)/(positive + negative + 2)
        probs_negative = (negative + 1)/(positive + negative + 2)
        
        result.append(negative) # Index 0 corresponds to negative 
        result.append(positive) # Index 1 corresponds to positive 
        result.append(math.log((probs_positive/probs_negative))) # Index 2 is the ratio result
        return result
    
    def mean_features(self, feature_matrix, output_matrix, num_negatives, num_positives):
        output_array = list()
        mean_X0 = list()
        mean_X1 = list()
        
        for i in range(len(output_matrix)):
            if(output_matrix[i][0] > 0):
                mean_X1.append(feature_matrix[i])
            else:
                mean_X0.append(feature_matrix[i])
                    
        mean_X0 = np.mean(mean_X0, 0)
        mean_X1 = np.mean(mean_X1,0)
        
        output_array.append(mean_X0)
        output_array.append(mean_X1)
        
        return output_array
    
    def covariance_calc(self, feature_matrix, output_matrix, mean_features):
        matrix_X0 = list()
        matrix_X1 = list()

        for i in range(len(output_matrix)):
            if(output_matrix[i][0] > 0):
                matrix_X1.append(feature_matrix[i])
            else:
                matrix_X0.append(feature_matrix[i])
                
        matrix_X0 = np.array(matrix_X0)
        matrix_X1 = np.array(matrix_X1)
        
        return np.cov(matrix_X0.transpose()) + np.cov(matrix_X1.transpose())
    
    #outputs the accuracy score
    def evaluate_acc(self, featuresX, target_labels):
        false_positive = 0
        false_negative = 0
        true_positive = 0
        true_negative = 0
        for i in range(len(target_labels)):
            predicted_val = self.predict(featuresX[i])
            #
            if(math.isclose(predicted_val, target_labels[i])):
                if(predicted_val > 0):
                   true_positive += 1
                else:
                   true_negative += 1
            else:
                if(predicted_val > 0):
                   false_positive += 1
                else:
                   false_negative += 1
        print("false postive ", false_positive,"false negative ", false_negative,"true postive ", true_positive, "true negative ", true_negative)
        
        return round((true_positive + true_negative)/ (true_positive + false_positive + false_negative + true_negative) * 100, 2)

In [449]:
lda = LDA()
final = lda.fit_function(X_train, y_train)

#Test with a sample of the data
result = lda.predict(X_test[0])

eval = lda.evaluate_acc(X_test, y_test)
print(eval)

-0.6007738604289302
(1, 1)
[ 0.12411591 -0.13074129 -0.07435004 -0.05078648  0.29706796  0.0332303
  0.51423549  0.01171434  0.04748319]
(1, 9)
(1, 1)
[ 0.12411591 -0.13074129 -0.07435004 -0.05078648  0.29706796  0.0332303
  0.51423549  0.01171434  0.04748319]
(1, 9)
(1, 1)
[ 0.11352797 -0.06066941 -0.0269962   0.01418999  0.28284471  0.07442439
  0.10766182  0.03822615  0.02765957]
(1, 9)
(1, 1)
[ 0.46588562 -0.0976482  -0.09905276  0.03745763  0.12478911  0.0490762
  0.31971894  0.04568671  0.04841237]
(1, 9)
(1, 1)
[ 0.07475709  0.02593203  0.07180464 -0.0477755   0.20808429  0.05762398
  0.2522358  -0.01295665  0.03591995]
(1, 9)
(1, 1)
[ 0.23780557 -0.11532296 -0.07512035 -0.00136814  0.29984355  0.05379271
  0.31624262  0.03210614  0.02708081]
(1, 9)
(1, 1)
[ 0.6880081   0.49436888  0.32946123  0.19887884  0.03657109  0.76259388
 -0.37096682  0.45177094  0.86656084]
(1, 9)
(1, 1)
[ 0.7021558   0.17010572 -0.0687566   0.53053754  0.41781342  0.79672039
 -0.13248162  0.21653899 -0.