<h2>Initializing</h2>
<p>Import Libraries, add variables for attribute strings (to save '') and load data</p>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix 
from sklearn import tree
from sklearn.metrics import classification_report
from scipy import interp
from sklearn.metrics import accuracy_score
from sklearn.utils.multiclass import unique_labels
import math

fixed_acidity = 'fixed acidity'
volatile_acidity = 'volatile acidity'
citric_acid = 'citric acid'
residual_sugar = 'residual sugar'
chlorides = 'chlorides'
free_sulfur_dioxide = 'free sulfur dioxide'
total_sulfur_dioxide = 'total sulfur dioxide'
density = 'density'
ph = 'pH'
sulphates = 'sulphates'
alcohol = 'alcohol'
quality = 'quality'
qclass = 'qclass'
ratio_to_fixed = 'ratio_to_fixed'
ratio_to_volatile = 'ratio_to_volatile'
ph_acidity = 'ph_acidity'
ratio_to_ph = 'ratio_to_ph'

In [2]:
def confusion_matrix_report(y_true, y_pred):
    cm, labels = confusion_matrix(y_true, y_pred), unique_labels(y_true, y_pred)
    column_width = max([len(str(x)) for x in labels] + [5])  # 5 is value length
    report = " " * column_width + " " + "{:_^{}}".format("Prediction", column_width * len(labels))+ "\n"
    report += " " * column_width + " ".join(["{:>{}}".format(label, column_width) for label in labels]) + "\n"
    for i, label1 in enumerate(labels):
        report += "{:>{}}".format(label1, column_width) + " ".join(["{:{}d}".format(cm[i, j], column_width) for j in range(len(labels))]) + "\n"
    return report
def root_mean_squared_error(y_true, y_pred):
        return math.sqrt(math.mean(math.square(y_pred - y_true), axis=-1)) 

<p>Import red and white dataset.<br/>
Split into input matrix (independent vars) and output vector (target vars).<br/>
</p>

<p>sort for kNN (https://scikit-learn.org/stable/modules/neighbors.html#unsupervised-nearest-neighbors) </p>

In [3]:
white = pd.read_csv("data/winequality-white1.csv")
red = pd.read_csv("data/winequality-red1.csv")
white.sort_values(by='quality',inplace=True)
red.sort_values(by='quality',inplace=True)
white_target = white[quality]
red_target = red[quality]
white_input = white.drop(quality,axis=1)
red_input = red.drop(quality,axis=1)

Add ratio of acidity to sugar as attribute.

In [4]:
#white[ph_acidity] = 7 - white[ph]
#red[ph_acidity] = 7 - red[ph]
#white_input[ratio_to_fixed]=white[residual_sugar]/white[fixed_acidity]
#white_input[ratio_to_volatile]=white[residual_sugar]/white[volatile_acidity]
#white_input[ratio_to_ph]=white[residual_sugar]/white[ph_acidity]
#red_input[ratio_to_fixed]=red[residual_sugar]/red[fixed_acidity]
#red_input[ratio_to_volatile]=red[residual_sugar]/red[volatile_acidity]
#red_input[ratio_to_ph]=red[residual_sugar]/red[ph_acidity]

<p>using log.</p>

In [5]:
log_white_norm=np.log(white_input+1)
log_red_norm=np.log(red_input+1)
f_log_white=log_white_norm.drop([fixed_acidity,sulphates,chlorides,citric_acid,ph,density,total_sulfur_dioxide,residual_sugar],axis=1)
f_log_red=log_red_norm.drop([free_sulfur_dioxide,ph,residual_sugar,chlorides,citric_acid,fixed_acidity,density],axis=1)

<p>Normalize using MinMax Normalizer.</p>

In [6]:
mm_white_norm=(white_input-white_input.min())/(white_input.max()-white_input.min())
mm_red_norm=(red_input-red_input.min())/(red_input.max()-red_input.min())
f_mm_white = mm_white_norm.drop([fixed_acidity,sulphates,chlorides,citric_acid,ph,density,total_sulfur_dioxide,residual_sugar],axis=1)
f_mm_red = mm_red_norm.drop([free_sulfur_dioxide,ph,residual_sugar,chlorides,citric_acid,fixed_acidity,density],axis=1)

<p>Normalize using Mean Normalizer.</p>

In [7]:
white_norm=(white_input-white_input.mean())/(white_input.std())
red_norm=(red_input-red_input.mean())/(red_input.std())

<p>Correlation Matrices</p>

In [8]:
wcorr = white.corr()
wcorr.style.background_gradient().set_precision(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.0227,0.289,0.089,0.0231,-0.0494,0.0911,0.265,-0.426,-0.0171,-0.121,-0.114
volatile acidity,-0.0227,1.0,-0.149,0.0643,0.0705,-0.097,0.0893,0.0271,-0.0319,-0.0357,0.0677,-0.195
citric acid,0.289,-0.149,1.0,0.0942,0.114,0.0941,0.121,0.15,-0.164,0.0623,-0.0757,-0.00921
residual sugar,0.089,0.0643,0.0942,1.0,0.0887,0.299,0.401,0.839,-0.194,-0.0267,-0.451,-0.0976
chlorides,0.0231,0.0705,0.114,0.0887,1.0,0.101,0.199,0.257,-0.0904,0.0168,-0.36,-0.21
free sulfur dioxide,-0.0494,-0.097,0.0941,0.299,0.101,1.0,0.616,0.294,-0.000618,0.0592,-0.25,0.00816
total sulfur dioxide,0.0911,0.0893,0.121,0.401,0.199,0.616,1.0,0.53,0.00232,0.135,-0.449,-0.175
density,0.265,0.0271,0.15,0.839,0.257,0.294,0.53,1.0,-0.0936,0.0745,-0.78,-0.307
pH,-0.426,-0.0319,-0.164,-0.194,-0.0904,-0.000618,0.00232,-0.0936,1.0,0.156,0.121,0.0994
sulphates,-0.0171,-0.0357,0.0623,-0.0267,0.0168,0.0592,0.135,0.0745,0.156,1.0,-0.0174,0.0537


In [9]:
rcorr = red.corr()
rcorr.style.background_gradient().set_precision(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256,0.672,0.115,0.0937,-0.154,-0.113,0.668,-0.683,0.183,-0.0617,0.124
volatile acidity,-0.256,1.0,-0.552,0.00192,0.0613,-0.0105,0.0765,0.022,0.235,-0.261,-0.202,-0.391
citric acid,0.672,-0.552,1.0,0.144,0.204,-0.061,0.0355,0.365,-0.542,0.313,0.11,0.226
residual sugar,0.115,0.00192,0.144,1.0,0.0556,0.187,0.203,0.355,-0.0857,0.00553,0.0421,0.0137
chlorides,0.0937,0.0613,0.204,0.0556,1.0,0.00556,0.0474,0.201,-0.265,0.371,-0.221,-0.129
free sulfur dioxide,-0.154,-0.0105,-0.061,0.187,0.00556,1.0,0.668,-0.0219,0.0704,0.0517,-0.0694,-0.0507
total sulfur dioxide,-0.113,0.0765,0.0355,0.203,0.0474,0.668,1.0,0.0713,-0.0665,0.0429,-0.206,-0.185
density,0.668,0.022,0.365,0.355,0.201,-0.0219,0.0713,1.0,-0.342,0.149,-0.496,-0.175
pH,-0.683,0.235,-0.542,-0.0857,-0.265,0.0704,-0.0665,-0.342,1.0,-0.197,0.206,-0.0577
sulphates,0.183,-0.261,0.313,0.00553,0.371,0.0517,0.0429,0.149,-0.197,1.0,0.0936,0.251


<h2>Assigning classes</h2>
<p>
    Assign classes based on quality. Less than 6; 6; better than 6. Because it makes sense considering the distribution of ratings.
</p>

In [10]:
#Assign classes for white
conditions = [(white['quality'] < 6),
              (white['quality'] > 6)]
choices = ['2_low', '1_high']
white['class'] = np.select(conditions, choices, default = '3_medium')
choices = [1,3]
white['classnum'] = np.select(conditions, choices, default = 2)
white_classnum = white['classnum']
white.drop('classnum',axis=1,inplace=True)
#Assign classes for red
conditions = [(red['quality'] < 6),
              (red['quality'] > 6)]
choices = ['3_low', '1_high']
red['class'] = np.select(conditions, choices, default = '2_medium')
choices = [1,3]
red['classnum'] = np.select(conditions, choices, default = 2)
red_classnum = red['classnum']
red.drop('classnum',axis=1,inplace=True)

#print('class distribution fot white:')
#print(white['class'].value_counts())
#print('\n class distribution for red:')
#print(red['class'].value_counts())

white_norm['class']=white['class']
red_norm['class']=red['class']

white_targetclass = white_classnum
red_targetclass = red_classnum
print(white_targetclass.value_counts())
print(red_targetclass.value_counts())
#white_targetclass = white['class']
#red_targetclass = red['class']

2    2198
1    1640
3    1060
Name: classnum, dtype: int64
1    744
2    638
3    217
Name: classnum, dtype: int64


In [11]:
white_norm_input = white_norm.drop('class', axis = 1)
red_norm_input = red_norm.drop('class', axis = 1)

In [12]:
#white_norm.loc[white_norm['class']=='3_high'].describe()
#white_norm.loc[white_norm['class']=='2_medium'].describe()
#white_norm.loc[white_norm['class']=='1_low'].describe()
white_norm_filtered = white_norm_input.drop([fixed_acidity,sulphates,chlorides,citric_acid,ph,density,total_sulfur_dioxide,residual_sugar],axis=1)
white_norm_filtered.describe()

Unnamed: 0,volatile acidity,free sulfur dioxide,alcohol
count,4898.0,4898.0,4898.0
mean,-1.998539e-14,-6.380722e-17,-4.325046e-14
std,1.0,1.0,1.0
min,-1.966784,-1.958477,-2.043089
25%,-0.6770318,-0.7237012,-0.8241915
50%,-0.1809733,-0.07691388,-0.09285319
75%,0.414297,0.6286722,0.719745
max,8.152811,14.91679,2.99502


In [13]:
#red_norm.loc[red_norm['class']=='3_high'].describe()
#red_norm.loc[red_norm['class']=='2_medium'].describe()
#red_norm.loc[red_norm['class']=='1_low'].describe()
red_norm_filtered = red_norm_input.drop([free_sulfur_dioxide,ph,residual_sugar,chlorides,citric_acid,fixed_acidity,density],axis=1)
red_norm_filtered.describe()

Unnamed: 0,volatile acidity,total sulfur dioxide,sulphates,alcohol
count,1599.0,1599.0,1599.0,1599.0
mean,8.477896e-15,2.36556e-16,6.625737e-15,2.204136e-14
std,1.0,1.0,1.0,1.0
min,-2.277567,-1.230199,-1.935902,-1.898325
25%,-0.7696903,-0.7438076,-0.63802,-0.8661079
50%,-0.04367545,-0.2574163,-0.2250577,-0.2092427
75%,0.6264921,0.4721707,0.4238832,0.6352984
max,5.876138,7.372847,7.9162,4.201138


<h2>k-NN classification<h2><p>https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html#the-curse-of-dimensionality</p>

<h3>white: normalized and selected for attributes with low std/range</h3>

In [14]:
def run_knn_report(inputs, targets):
    print("_______________________________________________________________________________________________")
    print("KNN, no weights")
    for n_neighbour in range(1,21):
        print(str(n_neighbour) + " neighbours:")
        knn_estimator = KNeighborsClassifier(n_neighbour)
        print(str(n_neighbour)+ " neighbours//")
        predicted = cross_val_predict(knn_estimator,inputs,targets,cv=cv)
        print(confusion_matrix_report(targets,predicted))
        print(classification_report(targets,predicted))
        try:
            print("RMSE:")
            print(root_mean_squared_error(targets,predicted))
        except(Error):
            pass
    
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1337)
#cv = KFold(n_splits=3)
run_knn_report(white_norm_filtered.values, white_targetclass)


_______________________________________________________________________________________________
KNN, no weights
1 neighbours:
1 neighbours//
      __Prediction___
         1     2     3
    1 1120   419   101
    2  420  1444   334
    3   84   326   650

             precision    recall  f1-score   support

          1       0.69      0.68      0.69      1640
          2       0.66      0.66      0.66      2198
          3       0.60      0.61      0.61      1060

avg / total       0.66      0.66      0.66      4898

RMSE:


NameError: name 'Error' is not defined

<h3><i>red: </i>normalized and selected for attributes with low std/range</h3>

In [None]:
run_knn_report(red_norm_filtered.values,red_targetclass)

<h3>white: normalized unselected</h3>

In [None]:
run_knn_report(white_norm_input.values,white_targetclass)

<h3><i>red: </i> normalized unselected</h3>

In [None]:
run_knn_report(red_norm_input.values,red_targetclass)

<h3> white: unfiltered, unnormalized <h3>

In [None]:
run_knn_report(white_input.values,white_targetclass)

<h3>red, unnormalized, unselected</h3>

In [None]:
run_knn_report(red_input.values,red_targetclass)

In [None]:
def run_knc_report(inputs, targets):
    knc_estimator = NearestCentroid()
    predicted = cross_val_predict(knc_estimator,inputs,targets,cv=cv)
    print(confusion_matrix_report(targets,predicted))
    print(classification_report(targets,predicted))
    try:
        print("RMSE:")
        print(root_mean_squared_error(targets,predicted))
    except(Error):
        pass

In [None]:
run_knc_report(white_norm_input.values,white_targetclass)

In [None]:
run_knc_report(red_norm_input.values,red_targetclass)

In [None]:
def run_kncs_report(inputs, targets,thresh=0):
    knc_estimator = NearestCentroid(shrink_threshold=thresh)
    predicted = cross_val_predict(knc_estimator,inputs,targets,cv=cv)
    print(confusion_matrix_report(targets,predicted))
    print(classification_report(targets,predicted))

In [None]:
run_kncs_report(white_norm_input.values,white_targetclass)

In [None]:
for t in range (0,1000,100):
    t = t/100000
    print(t)
    run_kncs_report(white_norm_input.values,white_targetclass,t)

In [None]:
def run_knnw_report(inputs, targets):
    print("_______________________________________________________________________________________________")
    print("KNN with inverse distance as weight:")
    for n_neighbour in range(1,21):
        print(str(n_neighbour) + " neighbours:")
        knn_estimator = KNeighborsClassifier(n_neighbour,weights='distance')
        print(str(n_neighbour)+ " neighbours//")
        predicted = cross_val_predict(knn_estimator,inputs,targets,cv=cv)
        print(confusion_matrix_report(targets,predicted))
        print(classification_report(targets,predicted))
        try:
            print("RMSE:")
            print(root_mean_squared_error(targets,predicted))
        except(Error e):
            pass

In [None]:
run_knnw_report(white_norm_input.values,white_targetclass)

In [None]:
run_knnw_report(red_norm_input.values,red_targetclass)

In [None]:
run_knnw_report(log_white_norm.values,white_targetclass)

In [None]:
run_knnw_report(log_red_norm.values,red_targetclass)

In [None]:
run_knnw_report(mm_white_norm.values,white_targetclass)

In [None]:
run_knnw_report(mm_red_norm.values,red_targetclass)

In [None]:
run_knn_report(white_norm_filtered.values,white_targetclass)

In [None]:
run_knnw_report(white_norm_filtered.values,white_targetclass)

In [None]:
run_knn_report(red_norm_filtered.values,red_targetclass)
run_knnw_report(red_norm_filtered.values,red_targetclass)

In [None]:
run_knn_report(f_log_red.values,red_targetclass)
run_knnw_report(f_log_red.values,red_targetclass)

In [None]:
run_knn_report(f_log_white.values,white_targetclass)
run_knnw_report(f_log_white.values,white_targetclass)

In [None]:
run_knn_report(f_mm_red.values,red_targetclass)
run_knnw_report(f_mm_red.values,red_targetclass)

In [None]:
run_knn_report(f_mm_white.values,white_targetclass)
run_knnw_report(f_mm_white.values,white_targetclass)