In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import graphviz
from sklearn import tree
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin'

from sklearn.metrics import confusion_matrix
import seaborn as sns

np.random.seed(0)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore') # for deprecated matplotlib functions

In [5]:
# load the data
data = pd.read_csv('data/heart.csv')
X = data.drop(['target'], axis=1)
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# K-Nearest Neighbor

In [6]:
knn = KNeighborsClassifier(n_neighbors=15, weights='uniform', 
                           algorithm='auto', leaf_size=30, p=1, 
                           metric='minkowski',metric_params=None,
                          n_jobs=-1)

knn.fit(X_train, y_train)
# fit the model using X as training data and Y as target values

y_validation_hat = knn.predict_proba(X_test)[:,1]

# roc_auc_score(y_true, y_score, average=’macro’, sample_weight=None, max_fpr=None)
knn_classifier_roc = roc_auc_score(y_test, y_validation_hat, average='macro', sample_weight=None)
print("roc auc:", knn_classifier_roc)
print("train_score:", knn.score(X_train, y_train))
print("test_score:", knn.score(X_test, y_test))

print("training error:", 1 - knn.score(X_train, y_train))
print("validation error:", 1 - knn.score(X_test, y_test))
# returns the mean accuracy on the given test data and labels

roc auc: 0.8685344827586206
train_score: 0.6859504132231405
test_score: 0.7704918032786885
training error: 0.31404958677685946
validation error: 0.2295081967213115


In [7]:
# Hyperparameter tuning
knn_clf = KNeighborsClassifier()
knn_param_grid = {'n_neighbors': [5, 10, 15, 20],
                'weights': ['uniform', 'distance'],
                 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                 'leaf_size': [30, 40, 50, 60]}

knn_search = GridSearchCV(knn_clf, param_grid=knn_param_grid, cv=3, scoring='accuracy', verbose=20, n_jobs=-1)
#rfc = RandomizedSearchCV(rf, param_distributions = rfc_param_grid, n_iter = 100, cv = 3, scoring='accuracy', verbose=20, random_state=42, n_jobs = -1)
knn_search.fit(X_train, y_train)
best_knn = knn_search.best_estimator_
print(best_knn)
print(knn_search.best_score_)
print(knn_search.best_params_)

Fitting 3 folds for each of 128 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
0.6487603305785123
{'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 15, 'weights': 'uniform'}


[Parallel(n_jobs=-1)]: Done 369 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 371 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 372 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 373 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 374 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 375 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:    5.0s finished
