In [11]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
import math


In [12]:
def featureDesign(features):
    """Build the model's feature vector from one parsed wine row.

    The input row is NOT modified; a new list is returned. The previous
    implementation rewrote and shortened the caller's list in place, so
    calling it twice on the same row silently corrupted the features.

    Steps applied to a copy of ``features``:
      * index 5 becomes ``features[5] / features[6]`` rounded to 4 decimal
        places (per the original comment: ratio of free sulphur to total
        sulphur);
      * index 11 is overwritten with the constant 1.0 (presumably the
        intercept/bias column -- TODO confirm);
      * index 6 is removed, since it is now folded into the ratio.

    Args:
        features: sequence of at least 12 numbers.

    Returns:
        A new list with one element fewer than ``features``.

    Raises:
        ZeroDivisionError: if ``features[6]`` is 0.
        IndexError: if ``features`` has fewer than 12 elements.
    """
    designed = list(features)  # copy so the caller's row stays intact

    # NOTE: rounding to 4 decimals loses precision; kept so results match
    # the values produced so far.
    designed[5] = round(designed[5] / designed[6], 4)

    designed[11] = 1.0
    del designed[6]
    return designed
    

In [13]:
def binaryOutput(features):
    """Binarize the value at index 11 of ``features`` in place.

    A value strictly greater than 5 becomes 1.0 (positive); anything else
    becomes 0.0 (negative). The same list object is returned, so the
    caller's list is modified.
    """
    features[11] = 1.0 if features[11] > 5 else 0.0
    return features

In [14]:
with open('winequality-red.csv') as wineFile:
    # The first line is read separately and is not parsed as data
    # (presumably the column header row).
    content = wineFile.readline()
    wine_feature_matrix = list()
    wine_output_matrix = list()

    # Reads the remaining lines one data row at a time
    for line in wineFile:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # float('') would otherwise raise ValueError.
        if not line.strip():
            continue

        # Split on ';', trim whitespace and convert every field to float
        OrigWineMatrix = [float(x.strip()) for x in line.split(';')]

        # Classifies the wine quality (index 11) as positive if > 5, otherwise negative
        OrigWineMatrix = binaryOutput(OrigWineMatrix)

        # Each label is wrapped in its own list, i.e. [1], [0], [1], so the
        # outputs stack into a column. It must be captured BEFORE
        # featureDesign, which overwrites index 11.
        output = [OrigWineMatrix[11]]
        wine_output_matrix.append(output)

        wine_feature_matrix.append(featureDesign(OrigWineMatrix))

# The file is no longer needed from here on, so this runs outside the `with`.
wine_output_matrix = np.array(wine_output_matrix)  # Create an array of outputs
wine_feature_matrix = np.array(wine_feature_matrix)  # Create an array of features using numpy for matrix calculations

print(wine_output_matrix)
print(wine_output_matrix.shape)
print(wine_feature_matrix)
print(wine_feature_matrix.shape)

print(wine_output_matrix[0][0])

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [1.]]
(1599, 1)
[[ 7.4    0.7    0.    ...  0.56   9.4    1.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    1.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    1.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     1.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    1.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     1.   ]]
(1599, 11)
0.0


In [39]:
#alpha = 0.0005 # Stepsize/learning rate
#iterations = 750 # Number of times we go through the gradient descent logic

class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient steps."""

    def gradient_descent(self, feature_matrix, output_matrix, learningRate, iterations):
        """Fit the weights by repeating the update w <- w + lr * X^T (y - sigmoid(X w)).

        This is a gradient step that increases the log-likelihood of the
        labels under the logistic model.

        Args:
            feature_matrix: 2-D NumPy array or ``np.matrix`` with one row per
                sample and one column per feature.
            output_matrix: 2-D NumPy array or ``np.matrix`` of labels with
                one row per sample and a single column.
            learningRate: step size applied to every update.
            iterations: number of updates to perform.

        Returns:
            The fitted weights, one row per feature and a single column. The
            result is an ``np.matrix`` when the inputs are matrices.
        """
        # Initialize every weight to 1
        weights_matrix = np.ones((np.shape(feature_matrix)[1], 1))
        print(weights_matrix.shape)

        # `@` is a true matrix product for both ndarray and np.matrix inputs.
        # The previous `*` was only a matrix product for np.matrix and
        # produced wrongly shaped results for plain arrays.
        # Loop-invariant, so computed once; the scaling is applied before the
        # product to keep the same arithmetic order as before.
        scaled_features_t = learningRate * feature_matrix.transpose()

        for _ in range(iterations):
            errors = output_matrix - self.sigmoid(feature_matrix @ weights_matrix)
            weights_matrix = weights_matrix + scaled_features_t @ errors

        return weights_matrix

    def predict(self, feature_matrix, weights_matrix):
        """Describe the predicted class of one sample as a string.

        The score is ``sum(feature_matrix * weights_matrix)`` passed through
        the sigmoid; for two 1-D arrays of equal length that is the sigmoid
        of their dot product. (Presumably a single sample is expected here,
        not a whole data set -- TODO confirm.)

        Returns:
            ``"Probability = <p> | classified as positive"`` when p > 0.5,
            otherwise the same text ending in ``negative``.
        """
        # Fixed: this used to call the non-existent `self.logistic`.
        probs = self.sigmoid(sum(feature_matrix * weights_matrix))

        label = "positive" if probs > 0.5 else "negative"
        return "Probability = " + str(probs) + " | classified as " + label

    def sigmoid(self, a):
        """Logistic function 1 / (1 + e^-a), i.e. the P(y=1|x) formula."""
        return 1.0 / (1 + np.exp(-a))

    # outputs the accuracy score
    def evaluate_acc(self, featuresX, true_labels, target_labels):
        """Placeholder for the accuracy score.

        TODO: not implemented yet -- the arguments are ignored and 0 is
        always returned.
        """
        return 0

In [40]:
# Training hyperparameters, named so they are easy to find and tune.
LEARNING_RATE = 0.005  # step size of each weight update
ITERATIONS = 5000  # number of weight updates

logisticRegression = LogisticRegression()

# np.asmatrix replaces np.mat: the `np.mat` alias was removed in NumPy 2.0,
# while np.asmatrix builds the same np.matrix objects on old and new versions.
weights_result = logisticRegression.gradient_descent(
    np.asmatrix(wine_feature_matrix),
    np.asmatrix(wine_output_matrix),
    LEARNING_RATE,
    ITERATIONS,
)
print(weights_result)

(11, 1)
[[   9.51456827]
 [-460.08566639]
 [ -17.03834079]
 [  -4.68990608]
 [ -76.69772813]
 [ 311.23945057]
 [-196.37668332]
 [-463.84946074]
 [ 227.48186954]
 [ 206.17335273]
 [-197.49651096]]


In [52]:
class LDA:
    """Building blocks for a two-class linear discriminant analysis model."""

    def fit_function(self, feature_matrix, output_matrix):
        """Compute the class log-odds and the per-class feature means.

        NOTE: fitting is unfinished -- the computed quantities are not used
        yet and 0 is always returned.
        """
        num_negatives, num_positives, logOdds_ratio = self.logOdds(output_matrix)
        # Fixed: mean_features was previously called without its two
        # required class-count arguments, which raised TypeError.
        means = self.mean_features(feature_matrix, output_matrix, num_positives, num_negatives)
        return 0

    def logOdds(self, output_matrix):
        """Count the classes and compute the smoothed log-odds of the positive class.

        A row counts as positive when its first element is > 0, otherwise
        as negative. The class probabilities use add-one smoothing:
        (count + 1) / (total + 2).

        Args:
            output_matrix: sequence of rows whose first element is the label.

        Returns:
            ``[negative_count, positive_count, log(p_positive / p_negative)]``.
        """
        # Counts the number of positives; every other row is a negative
        positive = sum(1 for element in output_matrix if element[0] > 0)
        negative = len(output_matrix) - positive

        # Smoothed class probabilities used for the log-odds ratio
        total = positive + negative + 2
        probs_positive = (positive + 1) / total
        probs_negative = (negative + 1) / total

        return [negative, positive, math.log(probs_positive / probs_negative)]

    def mean_features(self, feature_matrix, output_matrix, num_positives, num_negatives):
        """Compute the mean feature vector of each class.

        Args:
            feature_matrix: 2-D array-like, one row per sample.
            output_matrix: 2-D array-like of labels, one row per sample; a
                row is positive when its first element is > 0.
            num_positives: divisor for the positive-class sum.
            num_negatives: divisor for the negative-class sum.

        Returns:
            ``[mean_X0, mean_X1]``: the negative-class mean followed by the
            positive-class mean, each a 1-D array with one entry per
            feature. A class with no samples keeps an all-zero mean.
        """
        # Fixed: the array size now comes from the `feature_matrix` argument
        # instead of the notebook-global `wine_feature_matrix`.
        features = np.asarray(feature_matrix, dtype=float)
        # Fixed: labels are read from `output_matrix`; the old loop tested an
        # undefined name `element`.
        is_positive = np.asarray(output_matrix)[:, 0] > 0

        num_features = features.shape[1]
        mean_X1 = np.zeros(num_features)
        mean_X0 = np.zeros(num_features)

        if is_positive.any():
            mean_X1 = features[is_positive].sum(axis=0) / num_positives
        if (~is_positive).any():
            # Fixed: the negative-class sum was divided by num_positives.
            mean_X0 = features[~is_positive].sum(axis=0) / num_negatives

        return [mean_X0, mean_X1]

In [57]:
# Class balance of the labels: logOdds returns a three-element list whose
# last entry is the log-odds of the add-one-smoothed class probabilities.
# NOTE(review): the arrays shown in the recorded output below are not printed
# by this cell or by LDA.logOdds -- the saved output may be stale; re-run
# the notebook from a fresh kernel to confirm.
lda = LDA()
result = lda.logOdds(wine_output_matrix)
print(result) 


[14.8     1.4     0.      3.8     0.152   0.647   1.9956  7.02    1.12
 18.8     2.    ]
[15.8     2.4     1.      4.8     1.152   1.647   2.9956  8.02    2.12
 19.8     3.    ]
[7.9    1.2    0.5    2.4    0.576  0.8235 1.4978 4.01   1.06   9.9
 1.5   ]
[855, 744, 0.13888615776218258]
