In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
np.random.seed(42)

In [22]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",sep = ';')
y = df.pop('quality')

In [23]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [36]:
y.unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [24]:
df.shape

(4898, 11)

In [25]:
df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [26]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))

In [27]:
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size = 0.2)

In [56]:
def model_fit(alg, train_x, test_x, train_y, test_y, if_cv = True, cv_folds = 5):
    alg.fit(train_x, train_y)
    
    #performing Cross Validation
    if if_cv: 
        cv_score = cross_val_score(alg, train_x, train_y, cv = cv_folds, scoring = 'f1_macro')
    
    predictions = alg.predict(test_x)
    
    #Print 
    print("\n ************************* Model Report ************************* \n")
    print(classification_report(test_y, predictions))
    
    if if_cv: 
        print("CV REPORT :- Mean - %.3g | Std - %.3g | Min - %.3g | Max - %.3g"%(np.mean(cv_score),
                                                                                 np.std(cv_score),
                                                                                 np.min(cv_score),
                                                                                 np.max(cv_score)))
    
    print("Accuracy: ", accuracy_score(test_y, predictions))
    print('-'*100)
    
    cm = pd.DataFrame(confusion_matrix(test_y, predictions, labels = [6, 5, 7, 8, 4, 3, 9]))
    return cm

In [57]:
lr = LogisticRegression()
# labels = lr.classes_
cm = model_fit(lr, train_x, test_x, train_y, test_y)
print("\n ************************* Confusion matrix ************************* \n")
print(cm)


 ************************* Model Report ************************* 

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        39
           5       0.46      0.32      0.38       290
           6       0.49      0.83      0.62       450
           7       0.56      0.05      0.10       171
           8       0.00      0.00      0.00        25
           9       0.00      0.00      0.00         1

    accuracy                           0.48       980
   macro avg       0.22      0.17      0.16       980
weighted avg       0.46      0.48      0.41       980

CV REPORT :- Mean - 0.157 | Std - 0.0142 | Min - 0.142 | Max - 0.183
Accuracy:  0.4846938775510204
----------------------------------------------------------------------------------------------------

 ************************* Confusion matrix ************************* 

     0   1  2  3  4  5  6
0  372  75  3  0  0  0  0
1  195  9

In [58]:
def KNN_predicts(train_x, test_x, train_y, test_y, scaler, neighbours, metric = 'manhattan', weights = 'uniform'):
    
    train_scaled = scaler.fit_transform(train_x)
    test_scaled = scaler.transform(test_x)
    
    KNN = KNeighborsClassifier(n_neighbors = neighbours, metric = metric, weights = weights, n_jobs = -1)
    KNN.fit(train_scaled, train_y)
    predictions = KNN.predict(test_scaled)  
   
    print("Accuracy: ", accuracy_score(test_y, predictions))
    print('-'*100)
    return KNN

In [59]:
KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 1)

Accuracy:  0.636734693877551
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=1)

## Neighbours tuning

In [60]:
for k in range(1,11):
    print('Accuracy score on kNN using n_neighbours = {0}:'.format(2**k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 2**k)

Accuracy score on kNN using n_neighbours = 2: Accuracy:  0.5663265306122449
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 4: Accuracy:  0.5489795918367347
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 8: Accuracy:  0.5551020408163265
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 16: Accuracy:  0.5408163265306123
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 32: Accuracy:  0.5183673469387755
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = 64: Accuracy:  0.5295918367346939
------------------------------------

## Metric tuning

In [61]:
k = 4
for metric in ['euclidean', 'minkowski', 'manhattan', 'chebyshev']:
    print('Accuracy score on kNN using {} metric and {} neighbours:'.format(metric, k), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), k, metric)

Accuracy score on kNN using euclidean metric and 4 neighbours: Accuracy:  0.5551020408163265
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using minkowski metric and 4 neighbours: Accuracy:  0.5551020408163265
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using manhattan metric and 4 neighbours: Accuracy:  0.5489795918367347
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using chebyshev metric and 4 neighbours: Accuracy:  0.5387755102040817
----------------------------------------------------------------------------------------------------


## Weighted kNN tuning

'uniform' : uniform weights. All points in each neighborhood are weighted equally.

'distance' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

In [62]:
for weight in ['uniform','distance']:
    print('Accuracy score on kNN using n_neighbours = {0}:'.format(weight), end = ' ')
    KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 4, metric ='manhattan', weights = weight)

Accuracy score on kNN using n_neighbours = uniform: Accuracy:  0.5489795918367347
----------------------------------------------------------------------------------------------------
Accuracy score on kNN using n_neighbours = distance: Accuracy:  0.6448979591836734
----------------------------------------------------------------------------------------------------


## Feature Engineering

In [65]:
def create_poly(train_x, test_x, degree):
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train_x)
    test_poly = poly.fit_transform(test_x)
    return train_poly, test_poly

In [66]:
for degree in [1, 2, 3]:
    train_poly, test_poly = create_poly(train_x, test_x, degree)
    print('Polynomial degree', degree)
    Knn = KNN_predicts(train_poly, test_poly, train_y, test_y, StandardScaler(), 4, metric ='manhattan', weights = 'distance')

Polynomial degree 1
Accuracy:  0.6448979591836734
----------------------------------------------------------------------------------------------------
Polynomial degree 2
Accuracy:  0.6459183673469387
----------------------------------------------------------------------------------------------------
Polynomial degree 3
Accuracy:  0.6530612244897959
----------------------------------------------------------------------------------------------------


In [67]:
Knn

KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=4,
                     weights='distance')

In [68]:
train_poly, test_poly = create_poly(train_x, test_x, 2)

## Model pickling

In [105]:
# from sklearn.externals import joblib 
import joblib
joblib.dump(Knn,'KNNModel.pkl')

['KNNModel.pkl']

In [109]:
knn_new_model = joblib.load('KNNModel.pkl')

In [110]:
def feature_eng(df):
    df['feat1'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['feat2'] = df['pH'] * df['fixed acidity']
    return df

In [111]:
train_feat = feature_eng(train_x)
test_feat = feature_eng(test_x)


In [113]:
print("The score after feature engineering: ")
KNN_predicts(train_x, test_x, train_y, test_y, StandardScaler(), 4, metric ='manhattan',weights = 'distance')

The score after feature engineering: 
Accuracy:  0.676530612244898
----------------------------------------------------------------------------------------------------


KNeighborsClassifier(metric='manhattan', n_jobs=-1, n_neighbors=4,
                     weights='distance')