In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [8]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

In [9]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))

In [10]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
dtype: int64

In [11]:
df[:3]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1


In [12]:
y.nunique()

7

In [13]:
#Train test split

x_train, x_test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

In [14]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.45918367346938777


In [15]:
def fit_predict(x_train, x_test, y_train, y_test, scaler, 
                n_neighbours, metric = 'manhattan', weights = 'uniform'):
    train_scaled = scaler.fit_transform(x_train)
    test_scaled = scaler.transform(x_test)        
    knn = KNeighborsClassifier(n_neighbors=n_neighbours, metric=metric, 
                               weights=weights, n_jobs = 4)
    knn.fit(train_scaled, y_train)
    y_pred = knn.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

## Neighbours tuning

In [16]:
for k in range(1,11):
    print('Accuracy score on kNN using n_neighbours = {0}:'.format(2**k), end = ' ')
    fit_predict(x_train, x_test, y_train, y_test, StandardScaler(), 2**k)

Accuracy score on kNN using n_neighbours = 2: 0.5724489795918367
Accuracy score on kNN using n_neighbours = 4: 0.5551020408163265
Accuracy score on kNN using n_neighbours = 8: 0.5438775510204081
Accuracy score on kNN using n_neighbours = 16: 0.5418367346938775
Accuracy score on kNN using n_neighbours = 32: 0.5520408163265306
Accuracy score on kNN using n_neighbours = 64: 0.5387755102040817
Accuracy score on kNN using n_neighbours = 128: 0.5295918367346939
Accuracy score on kNN using n_neighbours = 256: 0.5163265306122449
Accuracy score on kNN using n_neighbours = 512: 0.5040816326530613
Accuracy score on kNN using n_neighbours = 1024: 0.47244897959183674


## Metric tuning

In [17]:
k=2
for metric in ['euclidean', 'cosine', 'manhattan', 'chebyshev']:
    print('Accuracy score on kNN using {} metric and {} neighbours:'.format(metric,k), end = ' ')
    fit_predict(x_train, x_test, y_train, y_test, StandardScaler(), 2, metric)

Accuracy score on kNN using euclidean metric and 2 neighbours: 0.573469387755102
Accuracy score on kNN using cosine metric and 2 neighbours: 0.5510204081632653
Accuracy score on kNN using manhattan metric and 2 neighbours: 0.5724489795918367
Accuracy score on kNN using chebyshev metric and 2 neighbours: 0.5704081632653061


# Weighted kNN
'uniform' : uniform weights. All points in each neighborhood are weighted equally.

'distance' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

In [18]:
for weights in ['uniform', 'distance']:
    print('Accuracy score on kNN using weights = {0}:'.format(weights), end = ' ')
    fit_predict(x_train, x_test, y_train, y_test, StandardScaler(), 2, 'chebyshev', weights = weights)

Accuracy score on kNN using weights = uniform: 0.5704081632653061
Accuracy score on kNN using weights = distance: 0.6489795918367347


## Feature Engineering

In [19]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] / df['sulphates']
    return df

x_train = feat_eng(x_train)
x_test = feat_eng(x_test)

In [20]:
print('Accuracy score after engineering:', end = ' ')
fit_predict(x_train, x_test, y_train, y_test, StandardScaler(), 2, 'chebyshev', weights = 'distance')

Accuracy score after engineering: 0.6704081632653062


In [21]:
original_score = 0.514285714286
best_score = 0.670408163265
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 30.36 %


In [22]:
KNeighborsClassifier?