In [1]:
## IMPORTS ##
import re
import csv
import pandas as pd
import numpy as np
import sys

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
### CONSTANTS ###
DATA_LINE = 15
ORIGINAL_FILENAME = "wine_quality.csv"
CLEANED_FILENAME = "wine_quality_cleaned2.csv"

### SET YOUR K, T, R1, R2 ###
K = 5
T = 5
R1 = 1
R2 = 25

In [3]:
### CLEAN CSV FILE - ONLY RUN ONCE TO GET CLEANED CSV ###
# file = open('{}'.format(ORIGINAL_FILENAME), "r")

# fieldnames = []
# for i in range(0, DATA_LINE-1):
#     line = file.readline()
#     m = re.search('(?<=@attribute )(.+)(?= \w+)', line)
#     if (m):
#         fieldnames.append(m.group(0))

# with open('{}'.format(CLEANED_FILENAME), "w", newline="") as clean_csv:
#     clean_csv.write(','.join(fieldnames) + "\n")
#     for line in file:
#         clean_csv.write(file.readline())

# file.close()

In [4]:
# READ DATA #
data = pd.read_csv("wine_quality_cleaned.csv")
data

Unnamed: 0,fixedacid,volacid,citricacid,residualsugar,chlorides,freesulfur,totalsulfur,density,pH,sulphates,alcohol,quality
0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.90,6
1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.80,6
2,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.99380,3.22,0.45,11.00,6
3,8.6,0.23,0.40,4.2,0.035,17.0,109.0,0.99470,3.14,0.53,9.70,5
4,6.6,0.16,0.40,1.5,0.044,48.0,143.0,0.99120,3.54,0.52,12.40,7
...,...,...,...,...,...,...,...,...,...,...,...,...
2445,6.5,0.33,0.38,8.3,0.035,60.0,148.0,0.98964,3.27,0.35,11.50,11
2446,6.6,0.34,0.40,8.1,0.046,68.0,170.0,0.99494,3.15,0.50,9.55,6
2447,6.8,0.22,0.36,1.2,0.052,38.0,127.0,0.99330,3.04,0.54,9.20,5
2448,6.1,0.34,0.29,2.2,0.036,25.0,100.0,0.98938,3.06,0.44,11.80,6


In [21]:
z=data.groupby("quality").count()
print("Class:\n",z)
X = data.iloc[:, :-1].values
print("X: \n", X)
y = data.iloc[:, 11].values
print("y: \n", y)

Class:
          fixedacid  volacid  citricacid  residualsugar  chlorides  freesulfur  \
quality                                                                         
0                1        1           1              1          1           1   
3               14       14          14             14         14          14   
4               82       82          82             82         82          82   
5              718      718         718            718        718         718   
6             1101     1101        1101           1101       1101        1101   
7              442      442         442            442        442         442   
8               87       87          87             87         87          87   
9                4        4           4              4          4           4   
11               1        1           1              1          1           1   

         totalsulfur  density    pH  sulphates  alcohol  
quality                                   

In [23]:
# KNN model - NO CROSS VALUATION
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)

In [7]:
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [8]:
#Testing kNN Model
y_pred = classifier.predict(X_test)

In [9]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("training accuracy: ",metrics.accuracy_score(y_test, y_pred))

[[  0   1   0   0   0   0]
 [  1   2   8   2   2   0]
 [  0   5  76  58   8   1]
 [  2   6  63 121  25   3]
 [  0   4  22  43  23   0]
 [  0   1   4   7   2   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.11      0.13      0.12        15
           5       0.44      0.51      0.47       148
           6       0.52      0.55      0.54       220
           7       0.38      0.25      0.30        92
           8       0.00      0.00      0.00        14

    accuracy                           0.45       490
   macro avg       0.24      0.24      0.24       490
weighted avg       0.44      0.45      0.44       490

training accuracy:  0.4530612244897959


In [10]:
# KNN model - WITH CROSS VALUATION
scores = cross_val_score(classifier, X, y, cv=T)
print("scores: ", scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

scores:  [0.4        0.41260163 0.39059305 0.42827869 0.42592593]
Accuracy: 0.41 (+/- 0.03)




In [11]:
# KNN model - WITH gridSearchCV
param_grid = dict(n_neighbors=np.arange(R1, R2+1)) # inclusive
print("param_grid: ", param_grid)
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(KNeighborsClassifier(), param_grid, cv=T, scoring="accuracy")
print("X: \n", X)
print("y: \n", y)
clf.fit(X, y)

param_grid:  {'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25])}
X: 
 [[ 7.2   0.23  0.32 ...  3.19  0.4   9.9 ]
 [ 7.    0.27  0.36 ...  3.    0.45  8.8 ]
 [ 8.1   0.22  0.43 ...  3.22  0.45 11.  ]
 ...
 [ 6.8   0.22  0.36 ...  3.04  0.54  9.2 ]
 [ 6.1   0.34  0.29 ...  3.06  0.44 11.8 ]
 [ 6.5   0.23  0.38 ...  3.29  0.54  9.7 ]]
y: 
 [6 6 6 ... 5 6 5]




GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [12]:
clf.cv_results_['mean_test_score']

array([0.44367347, 0.42285714, 0.41142857, 0.42      , 0.42081633,
       0.42285714, 0.42122449, 0.42489796, 0.42122449, 0.42612245,
       0.4322449 , 0.42693878, 0.42693878, 0.43061224, 0.42938776,
       0.43469388, 0.43755102, 0.43469388, 0.43387755, 0.42408163,
       0.42979592, 0.42897959, 0.43510204, 0.43714286, 0.4477551 ])

In [20]:
grid_df = pd.DataFrame(clf.cv_results_).sort_values(
        by=['rank_test_score']).set_index('rank_test_score')
display_cols = ['param_n_neighbors', "mean_test_score", "std_test_score"]

grid_df[display_cols]

Unnamed: 0_level_0,param_n_neighbors,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,25,0.447755,0.020341
2,1,0.443673,0.022685
3,17,0.437551,0.026761
4,24,0.437143,0.024565
5,23,0.435102,0.023254
6,18,0.434694,0.022896
6,16,0.434694,0.019856
8,19,0.433878,0.019987
9,11,0.432245,0.027673
10,14,0.430612,0.018633


In [15]:
print("best score: ", clf.best_score_)
print("best param: ", clf.best_params_)
print("best estimator: ", clf.best_estimator_)

best_line = { key : clf.cv_results_[key][clf.best_index_] for key in clf.cv_results_.keys() }
print("Best accuracy: %0.2f (+/- %0.2f) when k = %.0f" %
      (best_line['mean_test_score'], best_line['std_test_score'] * 2, best_line['param_n_neighbors']))

best score:  0.4477551020408163
best param:  {'n_neighbors': 25}
best estimator:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=25, p=2,
                     weights='uniform')
Best accuracy: 0.45 (+/- 0.04) when k = 25
