In [1]:
#import libraries 
import numpy as np  
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
#import data
# Forward slashes keep the relative paths portable across operating systems and
# avoid the invalid "\w" escape sequence the backslash-separated literals contained.
# The files are semicolon-separated.
red_wine = pd.read_csv("wine+quality/winequality-red.csv", sep=';')
white_wine = pd.read_csv("wine+quality/winequality-white.csv", sep=';')

#make a copy
# .copy() creates independent frames, so the in-place cleaning applied to
# rw / ww leaves the raw red_wine / white_wine frames untouched.
rw = red_wine.copy()
ww = white_wine.copy()

In [3]:
# Drop exact duplicate rows from both frames, modifying each frame in place.
for wine_frame in (rw, ww):
    wine_frame.drop_duplicates(inplace=True)

In [4]:
# Replace the spaces in the multi-word column names with underscores.
# Both frames are renamed in place with the same mapping.
snake_case_names = {
    'fixed acidity': 'fixed_acidity',
    'volatile acidity': 'volatile_acidity',
    'citric acid': 'citric_acid',
    'residual sugar': 'residual_sugar',
    'free sulfur dioxide': 'free_sulfur_dioxide',
    'total sulfur dioxide': 'total_sulfur_dioxide',
}
rw.rename(columns=snake_case_names, inplace=True)
ww.rename(columns=snake_case_names, inplace=True)

In [5]:
#add a column
# Tag every row with its wine colour before the two frames are combined.
# Plain assignment (rather than insert at the fixed position 12) can be re-run
# without raising "cannot insert colour, already exists", and a newly created
# column is appended as the last column, which is where the later cells expect
# the target to be.
rw["colour"] = "red"
ww["colour"] = "white"

In [6]:
# Stack the red and white frames on top of each other in one DataFrame.
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each
# frame's own index.
data = [rw, ww]
df_clean = pd.concat(objs=data, ignore_index=True)
df_clean

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,colour
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [7]:
# (rows, columns) of the combined frame
df_clean.shape

(5320, 13)

In [8]:
#check which distinct values the colour column holds, and the dtype of the returned array
df_clean["colour"].unique()

array(['red', 'white'], dtype=object)

In [9]:
# Encode colour as a binary integer flag: 1 where the value is "red", 0 for
# every other value (here "white"). This is a single 0/1 column, not a
# one-hot encoding.
# NOTE: re-running this cell on the already-encoded column sets every row to 0,
# because no value equals the string "red" any more.
df_clean["colour"] = df_clean["colour"].eq("red").astype(int)

In [10]:
# Number of rows per encoded colour value
df_clean.colour.value_counts()

0    3961
1    1359
Name: colour, dtype: int64

In [11]:
# Independent copy of the cleaned frame; the split cell below works on this copy
mlm1 = df_clean.copy()
mlm1

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,colour
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,1
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0


Split Data

In [12]:
#split data into train / validation / test (60 % / 20 % / 20 %)
# Shuffle once with a fixed seed so the split, and every metric computed from
# it, is reproducible. Positional slicing replaces np.split, which on a
# DataFrame relies on the deprecated DataFrame.swapaxes.
SPLIT_SEED = 42
shuffled = mlm1.sample(frac=1, random_state=SPLIT_SEED)  # frac=1 returns all rows in random order
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [13]:
#fix the scale of all my numbers 
def scale_dataset(mlm1, oversample=False, scaler=None):
    """Scale the feature columns of a frame and split off the target column.

    Every column except the last is treated as a feature; the last column is
    the target.

    Parameters
    ----------
    mlm1 : pandas.DataFrame
        Frame whose last column is the target.
    oversample : bool, default False
        Accepted for backward compatibility; currently unused.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler. When given, the features are only
        transformed with it, which lets validation and test data be scaled
        with the statistics learned from the training data. When omitted, a
        new StandardScaler is fitted on ``mlm1`` itself (the original
        behaviour).

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the target appended as the last column.
    X : numpy.ndarray
        Scaled features only.
    y : numpy.ndarray
        Target values, one-dimensional.
    """
    X = mlm1[mlm1.columns[:-1]].values  # all columns except the last
    y = mlm1[mlm1.columns[-1]].values   # only the last column

    if scaler is None:
        X = StandardScaler().fit_transform(X)
    else:
        X = scaler.transform(X)

    # y is 1-D; reshape it into a single column (2-D) so it can be stacked
    # horizontally next to the 2-D feature matrix.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

In [14]:
# Scale each split and unpack the (stacked array, features, target) triples.
# NOTE: every scale_dataset call fits its own StandardScaler, so the validation
# and test features are standardised with their own statistics rather than the
# training ones.
# NOTE: train / valid / test are rebound from DataFrames to arrays here, so this
# cell cannot be re-run without re-running the split cell first.
(train, X_train, y_train), (valid, X_valid, y_valid), (test, X_test, y_test) = (
    scale_dataset(split) for split in (train, valid, test)
)

In [15]:
# k-nearest-neighbours classifier that uses only the single closest training point (k = 1)
knn_model = KNeighborsClassifier(n_neighbors=1)
# Fit on the scaled training features; the target is the 0/1 colour flag
knn_model.fit(X_train, y_train)

In [16]:
# Predict the colour flag for the scaled test features
y_pred = knn_model.predict(X_test)

In [17]:
#check classification report: precision, recall, f1-score and support per class on the test split
print(classification_report(y_test, y_pred)) 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       794
           1       0.98      0.98      0.98       270

    accuracy                           0.99      1064
   macro avg       0.99      0.99      0.99      1064
weighted avg       0.99      0.99      0.99      1064



In [18]:
# Confusion matrix for the test split: rows are the true labels, columns are
# the predicted labels, in class order 0 (white) then 1 (red). The diagonal
# holds the correctly classified wines, the off-diagonal cells the mistakes.
# The call is restored so the cell reproduces the output saved beneath it.
print(confusion_matrix(y_test, y_pred))

[[788   6]
 [  6 264]]


In [19]:
# Keep the confusion matrix of the test predictions in a variable
cm = confusion_matrix(y_test, y_pred)