In [1]:
import pandas as pd
import numpy as np
import math 
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 1. loads the data file;
wine_data = pd.read_csv('winequality-white.csv',delimiter=";", nrows =800)  
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# 2. construct a new binary column “good wine” that indicates whether the wine is good
wine_data['Good wine'] = wine_data['quality'].apply(lambda x: 11 if x >= 7 else 0)
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Good wine
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,0


In [4]:
# 3. splits the data set into a training data set (first 400 samples), a validation data set (next 200 samples) and a test data set (last 200 samples)
training_data, validation_data, test_data = wine_data[:400], wine_data[400:600], wine_data[600:]

In [5]:
# 4. normalises the training data according to the Z-score transform
def normalises_the_data_z_score(input_data):
    data_normalised = preprocessing.scale(input_data)
    data_normalised = pd.DataFrame(data=data_normalised,columns = list(input_data.columns))
    data_normalised['Good wine'] = input_data['Good wine']
    return data_normalised
training_data_normalised = normalises_the_data_z_score(training_data)
validation_data_normalised = normalises_the_data_z_score(validation_data)
test_data_normalised = normalises_the_data_z_score(test_data)

In [6]:
# 5. loads and trains the k-Nearest Neighbours classifiers for k = 1,2, …,100
training_data_normalised_x = training_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
training_data_normalised_y = training_data['Good wine']

validation_data_normalised_x = validation_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
validation_data_normalised_y = validation_data['Good wine']
scores_list = []
for k in range(1,101):
   classifier = KNeighborsClassifier(n_neighbors=k)
   classifier.fit(training_data_normalised_x, training_data_normalised_y)
   validation_data_normalised_y_predicted = classifier.predict(validation_data_normalised_x)
   scores_list.append(1-metrics.accuracy_score(validation_data_normalised_y,validation_data_normalised_y_predicted,normalize=True))
scores_list_df = pd.DataFrame(data=scores_list).rename(columns={0: "Classificaion error(validation data)"})
scores_list_df['k-value'] = list(range(1,101))

In [7]:
scores_list_df

Unnamed: 0,Classificaion error(validation data),k-value
0,0.245,1
1,0.205,2
2,0.215,3
3,0.210,4
4,0.260,5
...,...,...
95,0.210,96
96,0.210,97
97,0.210,98
98,0.205,99


In [8]:
# 6. evaluates each classifier using the validation data set and selects the best classifier
best_k_value=scores_list_df['Classificaion error(validation data)'].idxmin()+1
best_k_value

53

In [9]:
# 7. predicts the generalisation error using the test data set.
test_data_normalised_x = test_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
test_data_normalised_y = test_data['Good wine']

test_data_normalised_y_predicted = classifier.predict(test_data_normalised_x)
testing_data_error_ratio= 1-metrics.accuracy_score(test_data_normalised_y,test_data_normalised_y_predicted,normalize=True)


testing_data_error_ratio

0.20999999999999996

In [13]:
# 8. 
#split the data set
new_training_data, new_validation_data, new_test_data = wine_data[:200], wine_data[200:400], wine_data[400:]

#normalises the training data according to the Z-score transform
new_training_data_normalised = normalises_the_data_z_score(new_training_data)
new_validation_data_normalised = normalises_the_data_z_score(new_validation_data)
new_test_data_normalised = normalises_the_data_z_score(new_test_data)

#loads and trains the k-Nearest Neighbours classifiers for k = 1,2, …,100
new_training_data_normalised_x = new_training_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
new_training_data_normalised_y = new_training_data['Good wine']

new_validation_data_normalised_x = new_validation_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
new_validation_data_normalised_y = new_validation_data['Good wine']

new_scores_list = []

In [14]:
for k in range(1,101):
   new_classifier = KNeighborsClassifier(n_neighbors=k)
   new_classifier.fit(new_training_data_normalised_x, new_training_data_normalised_y)
   new_validation_data_normalised_y_predicted = classifier.predict(new_validation_data_normalised_x)
   new_scores_list.append(1-metrics.accuracy_score(new_validation_data_normalised_y,new_validation_data_normalised_y_predicted,normalize=True))
new_scores_list_df = pd.DataFrame(data=new_scores_list).rename(columns={0: "New Classificaion error(validation data)"})
new_scores_list_df['k-value'] = list(range(1,101))

In [15]:
new_scores_list_df

Unnamed: 0,New Classificaion error(validation data),k-value
0,0.21,1
1,0.21,2
2,0.21,3
3,0.21,4
4,0.21,5
...,...,...
95,0.21,96
96,0.21,97
97,0.21,98
98,0.21,99


In [16]:
#evaluates each classifier using the validation data set and selects the best classifier
new_best_k_value=new_scores_list_df['New Classificaion error(validation data)'].idxmin()+1
new_best_k_value

1

In [17]:
#predicts the generalisation error using the test data set
new_test_data_normalised_x = new_test_data_normalised.copy().drop(columns=['quality', 'Good wine']).values
new_test_data_normalised_y = new_test_data['Good wine']

new_test_data_normalised_y_predicted = new_classifier.predict(new_test_data_normalised_x)
new_testing_data_error_ratio= 1-metrics.accuracy_score(new_test_data_normalised_y,new_test_data_normalised_y_predicted,normalize=True)


new_testing_data_error_ratio

0.21250000000000002

#### The new generalisation error increases a little from 0.2099 to 0.2125.