In [1]:
import numpy as np
from random import shuffle
import csv

wine_raw_data = []
with open('winequality-red.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    for index, item in enumerate(reader):
        if (index == 0):
            continue
        else:
            individual_row = np.array(item[0].split(';'))
            #Converting to binary
            if ( int(individual_row[11]) >= 6):
                individual_row[11] = 1
            else:
                individual_row[11] = 0
                
            wine_raw_data.append(list(map(float, individual_row)))
            
csvFile.close()
shuffle(wine_raw_data)
wine_features = []
wine_labels = []

for row in wine_raw_data:
    wine_labels.append(row[len(row)-1])
    wine_features.append(row[0:len(row)-1])

wine_features = np.asarray(wine_features)
wine_labels = np.asarray(wine_labels)

    
cancer_raw_data = []

with open('breast-cancer-wisconsin.data', 'r') as dataFile:
    cancer_reader = csv.reader(dataFile)
    for row in cancer_reader:
        #Removing bad data
        if '?' in row:
            continue
        else:
            cancer_raw_data.append(list(map(int, row)))
dataFile.close()
shuffle(cancer_raw_data)

cancer_features = []
cancer_labels = []

for row in cancer_raw_data:
    cancer_labels.append(row[len(row)-1])
    cancer_features.append(row[1:len(row)-1])

cancer_features = np.asarray(cancer_features)
cancer_labels = np.asarray(cancer_labels)

for index, element in np.ndenumerate(cancer_labels):
    if (element == 2):
        cancer_labels[index] = 0
    else:
        cancer_labels[index] = 1


In [2]:
import time
class LDA:
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
        self.cov = np.cov(self.features.T)
        self.P_1 = self.__label_prop(1, self.labels)
        self.P_0 = 1 - self.P_1
        self.mean_1 = self.__label_mean(1, self.features, self.labels)
        self.mean_0 = self.__label_mean(0, self.features, self.labels)
        

    def __label_prop(self, label, labels):
        count = 0;
        for i in labels:
            if i == label:
                count += 1
        return count/len(labels)

    def __label_mean(self, label, features, labels):
        mean = [0] * len(features[0])
        count = 0
        for i in range(len(features)):
            if labels[i] == label:
                count += 1
                mean += features[i]

        return mean/count
    

    def predict(self, x):
        cov_inv = np.linalg.inv(self.cov)
        t1 = np.log(self.P_1/self.P_0)
        t2 = -0.5 * np.matmul(np.matmul(self.mean_1.T, cov_inv), self.mean_1)
        t3 = 0.5 * np.matmul(np.matmul(self.mean_0.T, cov_inv), self.mean_0)
        t4 = np.matmul(np.matmul(x.T, cov_inv), (self.mean_1-self.mean_0))
        lda = t1 + t2 + t3 + t4
        if(lda > 0):
            return 1
        else:
            return 0

In [6]:
def accuracy(predicted, actual):
    count = 0
    for i in range(len(actual)):
        if predicted[i] == actual[i]:
            count += 1

    return 100 * count/len(actual)

def training_data_set(n, size, features, labels):
    #index_array contains all the indexes of the elements we want to remove
    index_array = []
    first_element_index = n*size
    for x in range(size):
        index_array.append(first_element_index + x)
        
    X_training_set = features
    X_filtered_set = []

    Y_training_set = labels
    Y_filtered_set = []
    
    for y in range(len(features)):
        if y in index_array:
            continue
        else:
            X_filtered_set.append(X_training_set[y])
            Y_filtered_set.append(Y_training_set[y])
    
    #X_Filtered_set and Y_Filtered_set contain the training examples we want.
    #That is the training data, not the validation data
    return X_filtered_set, Y_filtered_set

def kfold_script(folds, features, labels):
    fold_size = len(features)//folds
    average = 0
    for x in range(folds):
        Training_fold = training_data_set(x, fold_size, features, labels)
        X_Training = Training_fold[0]
        Y_Training = Training_fold[1]

        X_Training = np.asarray(X_Training)
        Y_Training = np.asarray(Y_Training)

        classifier = LDA(X_Training, Y_Training)

        X_test = features[x*fold_size:x*fold_size + fold_size]
        Y_test = labels[x*fold_size:x*fold_size + fold_size]

        predictions = []
        for i in range(len(X_test)):
            predictions.append(classifier.predict(X_test[i]))

        acc = accuracy(predictions, Y_test)
        average += acc
        print(acc)
    print('average ', average/folds)

folds = 5
print("cancer")
kfold_script(folds, cancer_features, cancer_labels)
print("wine")
kfold_script(folds, wine_features, wine_labels)

cancer
96.32352941176471
94.11764705882354
97.05882352941177
90.44117647058823
92.6470588235294
average  94.11764705882352
wine
74.92163009404389
72.41379310344827
71.15987460815047
76.8025078369906
75.86206896551724
average  74.23197492163008


In [7]:
##Some more statistics

print(np.mean(cancer_features, axis=0))
print(np.median(cancer_features, axis=0))
print(np.average(cancer_features, axis=0))
print(np.std(cancer_features, axis=0))
print(np.var(cancer_features, axis=0))
print(np.amin(cancer_features, axis=0))
print(np.amax(cancer_features, axis=0))

# pd.DataFrame(cancer_data, columns=cancer_titles)
print("WINE ----")
print(np.mean(wine_features, axis=0))
print(np.median(wine_features, axis=0))
print(np.average(wine_features, axis=0))
print(np.std(wine_features, axis=0))
print(np.var(wine_features, axis=0))
print(np.amin(wine_features, axis=0))
print(np.amax(wine_features, axis=0))

[4.44216691 3.15080527 3.21522694 2.83016105 3.23426061 3.54465593
 3.44509517 2.86969253 1.60322108]
[4. 1. 1. 1. 2. 1. 3. 1. 1.]
[4.44216691 3.15080527 3.21522694 2.83016105 3.23426061 3.54465593
 3.44509517 2.86969253 1.60322108]
[2.81869558 3.06290015 2.98639218 2.86246438 2.22145742 3.64118865
 2.44790258 3.05043084 1.73140525]
[ 7.94504479  9.38135733  8.91853827  8.19370232  4.93487306 13.25825475
  5.99222704  9.30512831  2.99776415]
[1 1 1 1 1 1 1 1 1]
[10 10 10 10 10 10 10 10 10]
WINE ----
[ 8.31963727  0.52782051  0.27097561  2.5388055   0.08746654 15.87492183
 46.46779237  0.99674668  3.3111132   0.65814884 10.42298311]
[ 7.9      0.52     0.26     2.2      0.079   14.      38.       0.99675
  3.31     0.62    10.2    ]
[ 8.31963727  0.52782051  0.27097561  2.5388055   0.08746654 15.87492183
 46.46779237  0.99674668  3.3111132   0.65814884 10.42298311]
[1.74055180e+00 1.79003704e-01 1.94740214e-01 1.40948711e+00
 4.70505826e-02 1.04568856e+01 3.28850367e+01 1.88674370e-03
 