In [1]:
## IMPORTS ##
import re
import csv
import pandas as pd
import numpy as np
import sys

from sklearn import metrics, preprocessing, pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
### CONSTANTS ###
# clean_csv treats the first DATA_LINE - 1 lines of the raw file as header
# lines and scans them for "@attribute" column declarations.
DATA_LINE = 15
# File names used by the (run-once) clean_csv step: raw input and cleaned output.
ORIGINAL_FILENAME = "wine_quality.csv"
CLEANED_FILENAME = "wine_quality_cleaned2.csv"

### DEFAULT K, T, R1, R2 ###
K = 4    # n_neighbors for the fixed-K models
T = 5    # passed as cv= to cross_val_score / GridSearchCV
R1 = 1   # lower bound of the n_neighbors grid searched by GridSearchCV
R2 = 25  # upper bound of that grid (inclusive: the grid is np.arange(R1, R2+1))

In [3]:
### CLEAN CSV FILE - ONLY RUN ONCE TO GET CLEANED CSV ###
def clean_csv(original_file_path, clean_file_path):
    """Convert the raw ARFF-style file into a plain CSV with a header row.

    The first ``DATA_LINE - 1`` lines of the input are treated as the header
    section: every ``@attribute <name> <type>`` line found there contributes
    ``<name>`` as a column name. Every remaining line is copied verbatim to
    the output, except blank lines and ``@`` directive lines (e.g. ``@data``),
    which are not data rows.

    Args:
        original_file_path: Path of the raw input file.
        clean_file_path: Path of the CSV file to create (overwritten if it
            already exists).
    """
    with open(original_file_path, "r") as raw_file:
        # Collect column names from the header section.
        fieldnames = []
        for _ in range(DATA_LINE - 1):
            match = re.search(r'(?<=@attribute )(.+)(?= \w+)', raw_file.readline())
            if match:
                fieldnames.append(match.group(0))

        with open(clean_file_path, "w", newline="") as clean_file:
            clean_file.write(','.join(fieldnames) + "\n")
            # Write the line yielded by the iteration itself. Calling
            # readline() inside the loop would consume a second line per
            # iteration and silently drop every other data row.
            for line in raw_file:
                stripped = line.strip()
                if not stripped or stripped.startswith("@"):
                    continue
                clean_file.write(line)

In [4]:
### UTILITY FUNCTION ###
def display_group_by(data, col_name):
    """Print the per-group row counts of ``data`` grouped by ``col_name``.

    Args:
        data: DataFrame to summarise.
        col_name: Name of the column to group by.
    """
    heading = "Grouped by '{}' :\n".format(col_name)
    counts_per_group = data.groupby(col_name).count()
    print(heading, counts_per_group)

def separate_data_from_label(data):
    """Split a DataFrame into a feature matrix and a label vector.

    The label is taken to be the last column and the features are every
    column before it, so the split works for any number of feature columns.

    Args:
        data: DataFrame whose last column holds the label.

    Returns:
        A two-element list ``[X, y]`` of NumPy arrays: ``X`` has one column
        per feature, ``y`` holds the label of each row.
    """
    X = data.iloc[:, :-1].values
    # Use the last column rather than a fixed position so the label always
    # matches the column excluded from X.
    y = data.iloc[:, -1].values
    return [X, y]


In [5]:
### KNN MODELS ###
def knn(original_data, K, test_ratio=0.20, scaled=False):
    """Fit a K-nearest-neighbours classifier on a hold-out split and print its test metrics.

    Args:
        original_data: DataFrame passed to ``separate_data_from_label`` to
            obtain the feature matrix and label vector.
        K: Number of neighbours (``n_neighbors``).
        test_ratio: Fraction of the rows held out as the test split.
        scaled: When True, standardise the features with a ``StandardScaler``
            fitted on the training split only and applied to both splits.
            The default (False) uses the raw feature values.

    Returns:
        None. The confusion matrix, classification report and accuracy on the
        test split are printed.
    """
    X, y = separate_data_from_label(original_data)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=test_ratio)

    if scaled:
        # Fit on the training split only so no test-set statistics leak in.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    classifier = KNeighborsClassifier(n_neighbors=K)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    print("Result of KNN using K={} is: \n".format(K))
    print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
    print("Classification report: \n", classification_report(y_test, y_pred))
    # Predictions are made on the held-out split, so this is test accuracy.
    print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred))

In [6]:
def knn_with_cross_validation(original_data, K, T, test_ratio=0.2, scaled=False):
    """Cross-validate a K-nearest-neighbours classifier and print its scores.

    Args:
        original_data: DataFrame passed to ``separate_data_from_label`` to
            obtain the feature matrix and label vector.
        K: Number of neighbours (``n_neighbors``).
        T: Value passed as ``cv`` to ``cross_val_score``.
        test_ratio: Accepted for signature compatibility; not used, because
            cross-validation runs over the whole dataset.
        scaled: When True, a ``StandardScaler`` is placed in front of the
            classifier in a pipeline, so scaling is re-fitted on the training
            part of every fold. The default (False) uses raw feature values.

    Returns:
        None. The per-fold scores and their mean (+/- two standard
        deviations) are printed.
    """
    X, y = separate_data_from_label(original_data)

    classifier = KNeighborsClassifier(n_neighbors=K)
    if scaled:
        classifier = pipeline.make_pipeline(preprocessing.StandardScaler(), classifier)

    scores = cross_val_score(classifier, X, y, cv=T)
    print("scores: ", scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [7]:
def knn_with_gridSearchCV(original_data, K, T, R1, R2,test_ratio=0.2, scaled=False):
    """Grid-search ``n_neighbors`` over ``[R1, R2]`` with cross-validation and print the ranking.

    Args:
        original_data: DataFrame passed to ``separate_data_from_label`` to
            obtain the feature matrix and label vector.
        K: Accepted for signature compatibility; not used, because the
            neighbour count is what the grid search varies.
        T: Value passed as ``cv`` to ``GridSearchCV``.
        R1: Smallest ``n_neighbors`` value tried.
        R2: Largest ``n_neighbors`` value tried (inclusive).
        test_ratio: Accepted for signature compatibility; not used.
        scaled: When True, a ``StandardScaler`` is placed in front of the
            classifier in a pipeline, so scaling is re-fitted on the training
            part of every fold. The default (False) uses raw feature values.

    Returns:
        None. The ranked grid results and the best configuration are printed.
    """
    X, y = separate_data_from_label(original_data)

    if scaled:
        estimator = pipeline.Pipeline([
            ("scaler", preprocessing.StandardScaler()),
            ("knn", KNeighborsClassifier()),
        ])
        # Pipeline parameters are addressed as "<step>__<param>".
        param_name = "knn__n_neighbors"
    else:
        estimator = KNeighborsClassifier()
        param_name = "n_neighbors"
    param_grid = {param_name: np.arange(R1, R2+1)}  # inclusive

    clf = GridSearchCV(estimator, param_grid,
                    cv=T, scoring="accuracy")
    clf.fit(X, y)

    # Normalise the parameter column name so the table looks the same whether
    # or not the scaling pipeline is in use.
    grid_df = (
        pd.DataFrame(clf.cv_results_)
        .rename(columns={"param_" + param_name: "param_n_neighbors"})
        .sort_values(by=['rank_test_score'])
        .set_index('rank_test_score')
    )
    display_cols = ['param_n_neighbors', "mean_test_score", "std_test_score"]

    print("Grid result by descending rank test score is: \n{}".format(grid_df[display_cols]))
    print("Best score: ", clf.best_score_)
    print("Best param: ", clf.best_params_)
    print("Best estimator: ", clf.best_estimator_)

    # Read the winning row directly instead of copying every cv_results_ key.
    best_std = clf.cv_results_['std_test_score'][clf.best_index_]
    print("Best accuracy: %0.2f (+/- %0.2f) when k = %d" %
          (clf.best_score_, best_std * 2, clf.best_params_[param_name]))

In [8]:
## MAIN ##
# Clean CSV - only run once to create a clean CSV file.
# clean_csv(ORIGINAL_FILENAME, CLEANED_FILENAME)

## WITH FULL DATASET ##
# Read csv
# NOTE(review): this path is hard-coded and differs from CLEANED_FILENAME
# ("wine_quality_cleaned2.csv"), the file the clean_csv step above would
# produce - confirm which cleaned file is meant to be analysed.
print("\n== == == == == == == == READ CSV == == == == == == == ==\n")
data = pd.read_csv("wine_quality_cleaned.csv")
# Show the per-label row counts for the 'quality' column.
display_group_by(data, "quality")

# KNN Models: fixed K on a hold-out split, fixed K with cross-validation
# (cv=T), then a grid search over n_neighbors in [R1, R2].
print("\n== == == == == == == == KNN - Predefined K = {} == == == == == == == ==\n".format(K))
knn(data, K=K)

print("\n== == == == == == == == KNN - Predefined K = {} + Cross Validation == == == == == == == ==\n".format(K))
knn_with_cross_validation(data, K=K, T=T)

print("\n== == == == == == == == KNN - GridSearchCV with K = [{},...,{}] == == == == == == == ==\n".format(R1, R2))
knn_with_gridSearchCV(data, K=K, T=T, R1=R1, R2=R2)


== == == == == == == == READ CSV == == == == == == == ==

Grouped by 'quality' :
          fixedacid  volacid  citricacid  residualsugar  chlorides  freesulfur  \
quality                                                                         
0                1        1           1              1          1           1   
3               14       14          14             14         14          14   
4               82       82          82             82         82          82   
5              718      718         718            718        718         718   
6             1101     1101        1101           1101       1101        1101   
7              442      442         442            442        442         442   
8               87       87          87             87         87          87   
9                4        4           4              4          4           4   
11               1        1           1              1          1           1   

         totalsulfur  den



Grid result by descending rank test score is: 
                param_n_neighbors  mean_test_score  std_test_score
rank_test_score                                                   
1                              25         0.447755        0.020341
2                               1         0.443673        0.022685
3                              17         0.437551        0.026761
4                              24         0.437143        0.024565
5                              23         0.435102        0.023254
6                              18         0.434694        0.022896
6                              16         0.434694        0.019856
8                              19         0.433878        0.019987
9                              11         0.432245        0.027673
10                             14         0.430612        0.018633
11                             21         0.429796        0.028390
12                             15         0.429388        0.016799
13             

In [9]:
## WITH FILTERED DATASET - REMOVE LABELS WITH ONLY 1 MEMBER ##
data = pd.read_csv("wine_quality_cleaned.csv")
# Keep only rows whose 'quality' label occurs at least twice, i.e. drop labels
# that have a single member. (The previous "> 2" test also dropped labels with
# exactly two rows, contradicting the stated intent.)
label_counts = data.groupby('quality').quality.transform('count')
data = data[label_counts >= 2].copy()
display_group_by(data, "quality")

# KNN Models: fixed K on a hold-out split, fixed K with cross-validation
# (cv=T), then a grid search over n_neighbors in [R1, R2].
print("\n== == == == == == == == KNN - Predefined K = {} == == == == == == == ==\n".format(K))
knn(data, K=K)

print("\n== == == == == == == == KNN - Predefined K = {} + Cross Validation == == == == == == == ==\n".format(K))
knn_with_cross_validation(data, K=K, T=T)

print("\n== == == == == == == == KNN - GridSearchCV with K = [{},...,{}] == == == == == == == ==\n".format(R1, R2))
knn_with_gridSearchCV(data, K=K, T=T, R1=R1, R2=R2)

Grouped by 'quality' :
          fixedacid  volacid  citricacid  residualsugar  chlorides  freesulfur  \
quality                                                                         
3               14       14          14             14         14          14   
4               82       82          82             82         82          82   
5              718      718         718            718        718         718   
6             1101     1101        1101           1101       1101        1101   
7              442      442         442            442        442         442   
8               87       87          87             87         87          87   
9                4        4           4              4          4           4   

         totalsulfur  density    pH  sulphates  alcohol  
quality                                                  
3                 14       14    14         14       14  
4                 82       82    82         82       82  
5             



Grid result by descending rank test score is: 
                param_n_neighbors  mean_test_score  std_test_score
rank_test_score                                                   
1                              25         0.448529        0.020437
2                               1         0.444036        0.022948
3                              17         0.438725        0.027021
4                              24         0.437908        0.024540
5                              23         0.435458        0.023048
6                              18         0.435049        0.022571
7                              16         0.434641        0.019548
8                              19         0.434232        0.019592
9                              11         0.432598        0.027541
10                             14         0.430964        0.018802
10                             21         0.430964        0.028428
12                             15         0.429739        0.016629
13             