In [78]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics 
from sklearn.model_selection import RepeatedStratifiedKFold

## MONK 1

In [141]:
path=r'/home/ludovico/ML-project/data/monks-1'
monk1_train = pd.read_csv(path+'.train', header=None, delim_whitespace=True, dtype=str)
monk1_test = pd.read_csv(path+'.test', header=None, delim_whitespace=True, dtype=str)

y_train=monk1_train[0]
x_train=monk1_train[monk1_train.columns[1:-1]]

y_test=monk1_test[0]
x_test=monk1_test[monk1_train.columns[1:-1]]

x_train=pd.get_dummies(x_train)
x_test=pd.get_dummies(x_test)


In [142]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(1,30,1)
parameters_KNN = {
    'n_neighbors': n_neigh,
    'weights': ('uniform','distance'),
    'p':(1,2),
    'metric':('minkowski','cosine')
}      
# with GridSearch
grid_search_KNN = GridSearchCV(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring = 'accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score = True
)

In [143]:
KNN_1=grid_search_KNN.fit(x_train, y_train)
print(grid_search_KNN.best_params_) 
print('Best Score - Validation:', grid_search_KNN.best_score_ )

y_pred_KNN1 =KNN_1.predict(x_test)

print('Accuracy Score - KNN - Test-error:', metrics.accuracy_score(y_test, y_pred_KNN1))  

{'metric': 'cosine', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
Best Score - Validation: 0.8079999999999999
Accuracy Score - KNN - Test-error: 0.8310185185185185


In [144]:
cv_results_df = pd.DataFrame(grid_search_KNN.cv_results_)

cv_results_df[[ 'params', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score,std_train_score
0,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.662033,0.083003,1.000000,0.000000
1,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.662033,0.083003,1.000000,0.000000
2,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.662033,0.083003,1.000000,0.000000
3,"{'metric': 'minkowski', 'n_neighbors': 1, 'p':...",0.662033,0.083003,1.000000,0.000000
4,"{'metric': 'minkowski', 'n_neighbors': 2, 'p':...",0.692867,0.062404,0.820560,0.022298
...,...,...,...,...,...
227,"{'metric': 'cosine', 'n_neighbors': 28, 'p': 2...",0.774900,0.087685,1.000000,0.000000
228,"{'metric': 'cosine', 'n_neighbors': 29, 'p': 1...",0.690267,0.094566,0.732644,0.036499
229,"{'metric': 'cosine', 'n_neighbors': 29, 'p': 1...",0.778100,0.093833,1.000000,0.000000
230,"{'metric': 'cosine', 'n_neighbors': 29, 'p': 2...",0.690267,0.094566,0.732644,0.036499


## MONK 3

In [200]:
path=r'/home/ludovico/ML-project/data/monks-3'
monk1_train = pd.read_csv(path+'.train', header=None, delim_whitespace=True, dtype=str)
monk1_test = pd.read_csv(path+'.test', header=None, delim_whitespace=True, dtype=str)

y_train=monk1_train[0]
x_train=monk1_train[monk1_train.columns[1:-1]]

y_test=monk1_test[0]
x_test=monk1_test[monk1_train.columns[1:-1]]

x_train=pd.get_dummies(x_train)
x_test=pd.get_dummies(x_test)


In [202]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(20,50,1)
parameters_KNN = {
    'n_neighbors': n_neigh,
    'weights': ('uniform','distance'),
    'p':(1,2),
    'metric':('minkowski','cosine','cityblock')
}      
# with GridSearch
grid_search_KNN = GridSearchCV(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring = 'accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score = True
)

In [203]:
KNN_1=grid_search_KNN.fit(x_train, y_train)
print(grid_search_KNN.best_params_) 
print('Best Score - Validation:', grid_search_KNN.best_score_ )

y_pred_KNN1 =KNN_1.predict(x_test)

print('Accuracy Score - KNN - Test-error:', metrics.accuracy_score(y_test, y_pred_KNN1))  

{'metric': 'minkowski', 'n_neighbors': 42, 'p': 1, 'weights': 'distance'}
Best Score - Validation: 0.9098333333333334
Accuracy Score - KNN - Test-error: 0.9467592592592593


In [165]:
cv_results_df = pd.DataFrame(grid_search_KNN.cv_results_)

cv_results_df[[ 'params','mean_test_score', 'std_test_score',  'mean_train_score', 'std_train_score']]

Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score,std_train_score
0,"{'metric': 'minkowski', 'n_neighbors': 20, 'p'...",0.844733,0.075443,0.881778,0.027895
1,"{'metric': 'minkowski', 'n_neighbors': 20, 'p'...",0.885967,0.066039,1.000000,0.000000
2,"{'metric': 'minkowski', 'n_neighbors': 20, 'p'...",0.844733,0.075443,0.881778,0.027895
3,"{'metric': 'minkowski', 'n_neighbors': 20, 'p'...",0.873567,0.064513,1.000000,0.000000
4,"{'metric': 'minkowski', 'n_neighbors': 21, 'p'...",0.857100,0.067443,0.890783,0.024988
...,...,...,...,...,...
235,"{'metric': 'cosine', 'n_neighbors': 48, 'p': 2...",0.899167,0.053547,1.000000,0.000000
236,"{'metric': 'cosine', 'n_neighbors': 49, 'p': 1...",0.818767,0.070654,0.836080,0.035922
237,"{'metric': 'cosine', 'n_neighbors': 49, 'p': 1...",0.898433,0.051360,1.000000,0.000000
238,"{'metric': 'cosine', 'n_neighbors': 49, 'p': 2...",0.818767,0.070654,0.836080,0.035922


## MONK 2

In [207]:
path=r'/home/ludovico/ML-project/data/monks-2'
monk1_train = pd.read_csv(path+'.train', header=None, delim_whitespace=True, dtype=str)
monk1_test = pd.read_csv(path+'.test', header=None, delim_whitespace=True, dtype=str)

y_train=monk1_train[0]
x_train=monk1_train[monk1_train.columns[1:-1]]

y_test=monk1_test[0]
x_test=monk1_test[monk1_train.columns[1:-1]]

#x_train=pd.get_dummies(x_train)
#x_test=pd.get_dummies(x_test)


In [208]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(1,50,1)
parameters_KNN = {
    'n_neighbors': n_neigh,
    'weights': ('uniform','distance'),
    'metric':('cityblock','cosine')
}      
# with GridSearch
grid_search_KNN = GridSearchCV(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring = 'accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score = True
)

In [209]:
KNN_1=grid_search_KNN.fit(x_train, y_train)
print(grid_search_KNN.best_params_) 
print('Best Score - Validation:', grid_search_KNN.best_score_ )

y_pred_KNN1 =KNN_1.predict(x_test)

print('Accuracy Score - KNN - Test-error:', metrics.accuracy_score(y_test, y_pred_KNN1))  

{'metric': 'cosine', 'n_neighbors': 1, 'weights': 'uniform'}
Best Score - Validation: 0.7953297682709448
Accuracy Score - KNN - Test-error: 0.8888888888888888


In [170]:
cv_results_df = pd.DataFrame(grid_search_KNN.cv_results_)

cv_results_df[[ 'params','mean_test_score', 'std_test_score',  'mean_train_score', 'std_train_score']]

Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score,std_train_score
0,"{'metric': 'cityblock', 'n_neighbors': 1, 'wei...",0.599572,0.065991,1.000000,0.000000
1,"{'metric': 'cityblock', 'n_neighbors': 1, 'wei...",0.599572,0.065991,1.000000,0.000000
2,"{'metric': 'cityblock', 'n_neighbors': 2, 'wei...",0.582852,0.063917,0.758743,0.017148
3,"{'metric': 'cityblock', 'n_neighbors': 2, 'wei...",0.588164,0.063404,1.000000,0.000000
4,"{'metric': 'cityblock', 'n_neighbors': 3, 'wei...",0.563476,0.074524,0.771306,0.023944
...,...,...,...,...,...
111,"{'metric': 'cosine', 'n_neighbors': 27, 'weigh...",0.628360,0.050607,1.000000,0.000000
112,"{'metric': 'cosine', 'n_neighbors': 28, 'weigh...",0.629679,0.040119,0.654144,0.018683
113,"{'metric': 'cosine', 'n_neighbors': 28, 'weigh...",0.625455,0.046437,1.000000,0.000000
114,"{'metric': 'cosine', 'n_neighbors': 29, 'weigh...",0.633137,0.051515,0.661537,0.024252
