In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')


In [8]:
np.random.seed(42) #to fix the random values to be  taken in the dataset

In [2]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",sep = ';')


In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [9]:
df['quality'].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [10]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [13]:
y=df.pop('quality')

In [22]:
train_x,test_x,train_y,test_y= train_test_split(df,y,test_size=0.2)

In [26]:
def model_fit(alg, train_x, test_x, train_y, test_y, if_cv = True, cv_folds = 5):
    alg.fit(train_x, train_y)
    
    #Cross-val
    if if_cv:
        cv_score = cross_val_score(alg, train_x, train_y, cv = cv_folds, scoring = 'f1_macro')
    
    prediction = alg.predict(test_x)
    
    if if_cv:
        print('CV report: Mean - %.3g | Std - %.3g | Min - %.3g, Max- %.3g' %(np.mean(cv_score),
                                                                              np.std(cv_score), 
                                                                              np.min(cv_score), 
                                                                              np.max(cv_score)))
    print("Accuracy: ", accuracy_score(test_y, prediction))
    print('-'*100)
    
    cm = pd.DataFrame(confusion_matrix(test_y, prediction))
    print(cm)

    

In [27]:
lm=LogisticRegression()

In [28]:
model_fit(lm,train_x, test_x, train_y, test_y)

CV report: Mean - 0.167 | Std - 0.0287 | Min - 0.145, Max- 0.224
Accuracy:  0.45714285714285713
----------------------------------------------------------------------------------------------------
   0  1   2    3  4  5
0  0  0   3    2  0  0
1  0  0   8   22  0  0
2  0  0  81  204  0  0
3  0  0  72  362  1  0
4  0  0  13  157  5  0
5  0  0   5   45  0  0


In [30]:
def KNN_predicts(train_x, test_x, train_y, test_y, scaler, neighbours, 
                 metric = 'manhattan', 
                 weights = 'uniform'):
    
    train_scaled = scaler.fit_transform(train_x)
    test_scaled = scaler.transform(test_x)
    
    KNN = KNeighborsClassifier(n_neighbors= neighbours, metric = metric, 
                               weights = weights, n_jobs = -1)
    KNN.fit(train_scaled, train_y)
    prediction = KNN.predict(test_scaled)
    
    print('Accuracy: ', accuracy_score(test_y, prediction))
    print('-'*100)
    return KNN


In [32]:
KNN_predicts(train_x, test_x, train_y, test_y,StandardScaler(),1)

Accuracy:  0.6489795918367347
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=1)

In [34]:
for k in range(1, 11):
    print('Accuracy score on KNN using n_neighbors = {0}:'.format(k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), k)


Accuracy score on KNN using n_neighbors = 1: Accuracy:  0.6489795918367347
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 2: Accuracy:  0.5816326530612245
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 3: Accuracy:  0.5653061224489796
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 4: Accuracy:  0.5673469387755102
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 5: Accuracy:  0.5581632653061225
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using n_neighbors = 6: Accuracy:  0.5387755102040817
---------------------------------------------

In [38]:
k=5
for metric in ['euclidean','minkowski','manhattan','chebyshev']:
    print('Accuracy score on KNN using {} metric and {} neighbors:'.format(metric,k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), k,metric)


Accuracy score on KNN using euclidean metric and 5 neighbors: Accuracy:  0.5408163265306123
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using minkowski metric and 5 neighbors: Accuracy:  0.5408163265306123
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using manhattan metric and 5 neighbors: Accuracy:  0.5581632653061225
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using chebyshev metric and 5 neighbors: Accuracy:  0.5336734693877551
----------------------------------------------------------------------------------------------------


In [40]:
for weight in ['uniform','distance']:
    print('Accuracy score on KNN using  neighbors {} weights: '.format(weight), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 5,'manhattan',weights=weight)

Accuracy score on KNN using  neighbors uniform weights:  Accuracy:  0.5581632653061225
----------------------------------------------------------------------------------------------------
Accuracy score on KNN using  neighbors distance weights:  Accuracy:  0.6581632653061225
----------------------------------------------------------------------------------------------------
