In [11]:
import numpy as np
import matplotlib.pyplot as plt
import shapely
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

In [12]:
#Loading the wine data and adding names of columns
data = pd.read_table('wine.data', sep=",", header=None, index_col = None 
                   ,names = ["Class", "Alcohol", "Malic acid", "Ash"
                    ,"Alcalinity of ash","Magnesium","Total phenols"
                    ,"Flavanoids","Nonflavanoid phenols","Proanthocyanins"
                    ,"Color Intensity","Hue","OD280/OD315 of diluted wines","Proline"]);

n_max = len(data.columns) - 1 

#Function for choosing number of columns for analysis
def set_X(n):
    X = data.iloc[:,1:n+1]
    return X 
#Vector of classes 
y = data.iloc[:,0]

In [13]:
#LDA, QDA, NB functions (accuracy of classifier, and confusion matrix)

def LDA(n):
    lda = LinearDiscriminantAnalysis()
    lda.fit(set_X(n),y)
    pred_lda = lda.predict(set_X(n))
    a_all = accuracy_score(y,pred_lda)
    cm_all = confusion_matrix(y,pred_lda)
    print("LDA")
    if n < n_max:
        print("For", n, "columns")
    elif n > 13: 
        n = n_max
        print("For all parameters:","(",n_max,")")
    print("Accuracy:",a_all)
    print("Confusion matrix: \n",cm_all)
    print("\n")
    
def QDA(n):
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(set_X(n),y)
    pred_qda = qda.predict(set_X(n))
    a_all = accuracy_score(y,pred_qda)
    cm_all = confusion_matrix(y,pred_qda)
    print("QDA")
    if n < n_max:
        print("For", n, "columns")
    elif n_max > 13: 
        n = n_max
        print("For all parameters:","(",n_max,")")
    print("Accuracy:",a_all)
    print("Confusion matrix: \n",cm_all)
    print("\n")

def NB(n):
    nb = GaussianNB()
    nb.fit(set_X(n),y)
    pred_nb = nb.predict(set_X(n))
    a_all = accuracy_score(y,pred_nb)
    cm_all = confusion_matrix(y,pred_nb)
    print("NB")
    if n < n_max:
        print("For", n, "columns")
    elif n_max > 13: 
        n = n_max
        print("For all parameters:","(",n_max,")")
    print("Accuracy:",a_all)
    print("Confusion matrix: \n",cm_all)
    print("\n")
    

In [14]:
#Classifiers for all methods with different number of parameters that was taken into account
LDA(13)
QDA(13)
NB(13)

LDA(2)
QDA(2)
NB(2)

LDA(5)
QDA(5)
NB(5)

LDA(10)
QDA(10)
NB(10)

LDA
Accuracy: 1.0
Confusion matrix: 
 [[59  0  0]
 [ 0 71  0]
 [ 0  0 48]]


QDA
Accuracy: 0.9943820224719101
Confusion matrix: 
 [[59  0  0]
 [ 1 70  0]
 [ 0  0 48]]


NB
Accuracy: 0.9887640449438202
Confusion matrix: 
 [[58  1  0]
 [ 0 70  1]
 [ 0  0 48]]


LDA
For 2 columns
Accuracy: 0.8089887640449438
Confusion matrix: 
 [[51  1  7]
 [ 5 61  5]
 [ 7  9 32]]


QDA
For 2 columns
Accuracy: 0.8146067415730337
Confusion matrix: 
 [[52  1  6]
 [ 4 62  5]
 [ 7 10 31]]


NB
For 2 columns
Accuracy: 0.8089887640449438
Confusion matrix: 
 [[51  1  7]
 [ 4 62  5]
 [ 7 10 31]]


LDA
For 5 columns
Accuracy: 0.8764044943820225
Confusion matrix: 
 [[54  1  4]
 [ 1 65  5]
 [ 3  8 37]]


QDA
For 5 columns
Accuracy: 0.8876404494382022
Confusion matrix: 
 [[53  1  5]
 [ 1 65  5]
 [ 2  6 40]]


NB
For 5 columns
Accuracy: 0.8539325842696629
Confusion matrix: 
 [[52  3  4]
 [ 2 62  7]
 [ 3  7 38]]


LDA
For 10 columns
Accuracy: 0.9887640449438202
Confusion matrix: 
 [[59  0  0]
 [ 1 70  0]
 [ 0  1 47]]



Accuracies of classifiers are almost the same for analysis with the same number of parameters. If we have more parameters in calculation then accuracy of classifier is higher. 

In [15]:
#Splitting data into two groups(training set(PU) and test set(PT)) which are in relation 75/25 
X_PU, X_PT, y_PU, y_PT = train_test_split(ustal_X(2), y, test_size=0.25, random_state=1)
#In need to make validation set(PW) we need to split our training set
X_PU, X_PW, y_PU, y_PW = train_test_split(ustal_X(2), y, test_size=0.33, random_state=1)
#Otrzymujemy PU/PW/PT w stosunku 50/25/25 
#Now we have sets PU/PW/PT in relation 50/25/25 

#For PU we crate classifiers that will be validated by PW 
#based on accuracies for all the methods we gonna choose the best classifier
print("Accuracy:")
lda = LinearDiscriminantAnalysis()
lda.fit(X_PU,y_PU)
pred_lda = lda.predict(X_PW)
a_lda = accuracy_score(y_PW,pred_lda)
print("LDA:",a_lda)

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_PU,y_PU)
pred_qda = qda.predict(X_PW)
a_qda = accuracy_score(y_PW,pred_qda)
print("QDA:",a_qda)

nb = GaussianNB()
nb.fit(X_PU,y_PU)
pred_nb = nb.predict(X_PW)
a_nb = accuracy_score(y_PW,pred_nb)
print("NB:",a_nb)

Accuracy:
LDA: 0.8305084745762712
QDA: 0.864406779661017
NB: 0.847457627118644


PW showed that QDA classifier has the highest accuracy. We can expect analysis on PT will have highest accuracy on QDA classifier. 

In [16]:
print("Accuracy:")
lda = LinearDiscriminantAnalysis()
lda.fit(X_PU,y_PU)
pred_lda = lda.predict(X_PT)
print("LDA:",accuracy_score(y_PT,pred_lda))

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_PU,y_PU)
pred_qda = qda.predict(X_PT)
print("QDA:",accuracy_score(y_PT,pred_qda))

nb = GaussianNB()
nb.fit(X_PU,y_PU)
pred_nb = nb.predict(X_PT)
print("NB:",accuracy_score(y_PT,pred_nb))

Accuracy:
LDA: 0.8444444444444444
QDA: 0.8666666666666667
NB: 0.8444444444444444


In [17]:
#LDA for two first parameters 
lda = LinearDiscriminantAnalysis()
lda.fit(set_X(2),y)

#A ten-fold [cv = 10] cross-validation of the above
#the following function divides the original data into 10 subsets
#successively, each of these sets is taken as PT and the rest as PU

y_pred = cross_val_predict(lda, set_X(2), y, cv=10)
a_kw = accuracy_score(y, y_pred)
print("Accuracy: ",a_kw)

Accuracy:  0.7640449438202247
