In [1]:
#import libraries 
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
#from imblearn.over_sampling import RandomOverSampler 
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
#import data 
red_wine = pd.read_csv("wine+quality\winequality-red.csv", sep= ';', delimiter=None) 

#make a copy
rw = red_wine


In [3]:
#remove all duplicates 
rw.drop_duplicates(inplace=True)


In [4]:
#unify column names 
rw.rename(columns = {'fixed acidity':'fixed_acidity','volatile acidity':'volatile_acidity','citric acid':'citric_acid','residual sugar':'residual_sugar','free sulfur dioxide':'free_sulfur_dioxide', 'total sulfur dioxide':'total_sulfur_dioxide'}, inplace=True)

In [5]:
#add a column 
#rw.insert(12, "colour", 'red')

In [6]:
#add quality label column - low 1-4, medium 5-6, high 7-10 
rw['quality_label'] = rw['quality'].apply(lambda value: 'low'
if value <= 4 else 'medium'
if value <= 6 else 'high')
rw['quality_label'] = pd.Categorical(rw['quality_label'], 
categories=['low', 'medium', 'high']) 


In [7]:
#check unique values of this column 
rw['quality_label'].unique()

['medium', 'high', 'low']
Categories (3, object): ['low', 'medium', 'high']

In [8]:
#see data distribution 
rw['quality_label'].value_counts()

medium    1112
high       184
low         63
Name: quality_label, dtype: int64

Label Encoder 

In [9]:
#make a copy 
rw1 = rw.copy()

In [10]:
#first save label encoder 
lbl_enc = LabelEncoder()

#use function fit transform 
rw1['quality_label'] = lbl_enc.fit_transform(rw1['quality_label'])



In [11]:
rw1['quality_label'].value_counts()

2    1112
0     184
1      63
Name: quality_label, dtype: int64

Split Data

In [12]:
train, valid, test = np.split(rw1.sample(frac=1), [int(0.6*len(rw1)), int(0.8*len(rw1))])

In [13]:
def scale_dataset(rw1, oversample=False):
    X = rw1[rw1.columns[:-1]].values #calling all columns until the last one 
    y = rw1[rw1.columns[-1]].values #calling only the last column 

    scaler = StandardScaler() # check rules here for group splitting 
    X = scaler.fit_transform(X) 

    data = np.hstack((X, np.reshape(y, (-1, 1))))
    # y is only 2D, so call numpy reshape to make it  3Dimensional Item 
    # because y is only 1 column

    return data, X, y

In [14]:
train, X_train, y_train = scale_dataset(train)
valid, X_valid, y_valid = scale_dataset(valid)
test, X_test, y_test = scale_dataset(test)

In [15]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

In [16]:
y_pred = knn_model.predict(X_test)

In [17]:
#check classification report 
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.97      0.74      0.84        47
           1       0.78      0.50      0.61        14
           2       0.92      0.99      0.95       211

    accuracy                           0.92       272
   macro avg       0.89      0.74      0.80       272
weighted avg       0.92      0.92      0.91       272



In [18]:
# do a confusion matrix  
print(confusion_matrix(y_test, y_pred))
#google explaination

[[ 35   0  12]
 [  0   7   7]
 [  1   2 208]]
