In [1]:
# Import packages
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import neighbors
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load and preprocess data

In [2]:
# Load datasets
df = pd.read_excel('Dokumenter\ChildrenWristFeaturesV2_TSF_new.xlsx')
df.fillna(df.mean(), inplace=True)

In [3]:
# Initialize features and targets/labels
X = df.iloc[:, 1:32]
Y = df.iloc[:, 32]

# Splits datasets into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Data preprocessing, scaling datasets/standardization
scal_data = StandardScaler()
X_train = scal_data.fit_transform(X_train)
X_test = scal_data.transform(X_test)

# Hyperparameter tuning

In [18]:
# Try different range of values
# Number of neighbors
#n_neighbors = [2, 5, 7, 10, 15]
# Number of features to consider at every split
weights = ['uniform', 'distance']
# Maximum depth
algorithm = ['ball_tree', 'kd_tree', 'brute', 'auto']
# Leaf size passed to BallTree or KDTree
leaf_size = [1, 5, 10, 15]
# Power parameter for the Minkowski metric
p = [1, 2]

# Create dictionary to store the hyperparameters
param_grid = { 'weights': weights,
               'algorithm': algorithm,
               'leaf_size': leaf_size,
               'p': p}

# Base model
knn = neighbors.KNeighborsClassifier()

# First run

In [10]:
# Use the GridSearchCV function to cross validate and find the best values for each hyperparameter
knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2)
knn_grid.fit(X_train, y_train)
print(knn_grid.best_params_)
print(knn_grid.best_score_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] algorithm=ball_tree, n_neighbors=2, weights=uniform .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=ball_tree, n_neighbors=2, weights=uniform, total= 1.6min
[CV] algorithm=ball_tree, n_neighbors=2, weights=uniform .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.6min remaining:    0.0s


[CV]  algorithm=ball_tree, n_neighbors=2, weights=uniform, total= 1.4min
[CV] algorithm=ball_tree, n_neighbors=2, weights=uniform .............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=uniform, total= 1.4min
[CV] algorithm=ball_tree, n_neighbors=2, weights=uniform .............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=uniform, total= 1.4min
[CV] algorithm=ball_tree, n_neighbors=2, weights=uniform .............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=uniform, total= 1.3min
[CV] algorithm=ball_tree, n_neighbors=2, weights=distance ............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=distance, total= 1.3min
[CV] algorithm=ball_tree, n_neighbors=2, weights=distance ............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=distance, total= 1.3min
[CV] algorithm=ball_tree, n_neighbors=2, weights=distance ............
[CV]  algorithm=ball_tree, n_neighbors=2, weights=distance, total= 1.4min
[CV] algorithm=ball_tree, n_neighbors=2, weights=distance ..

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 158.9min finished


NameError: name 'dt_grid' is not defined

In [11]:
print(knn_grid.best_params_)
print(knn_grid.best_score_)

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'weights': 'distance'}
0.6865772550332768


# Second run

In [8]:
# Use the GridSearchCV function to cross validate and find the best values for each hyperparameter
knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2)
knn_grid.fit(X_train, y_train)
print(knn_grid.best_params_)
print(knn_grid.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance, total=  23.6s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.5s remaining:    0.0s


[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance, total=  23.2s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance 
[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance, total=  23.6s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance 
[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance, total=  23.2s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance 
[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=1, weights=distance, total=  22.7s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=2, weights=distance 
[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=2, weights=distance, total=  37.0s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=2, weights=distance 
[CV]  algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=2, weights=distance, total=  35.9s
[CV] algorithm=ball_tree, leaf_size=1, n_neighbors=2, p=2, weig

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 124.6min finished


{'algorithm': 'ball_tree', 'leaf_size': 1, 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.7316185012160491


# Third run

In [19]:
# Use the GridSearchCV function to cross validate and find the best values for each hyperparameter
knn_grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=2)
knn_grid.fit(X_train, y_train)
print(knn_grid.best_params_)
print(knn_grid.best_score_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=uniform ..........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=uniform, total=  30.2s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=uniform ..........


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   30.1s remaining:    0.0s


[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=uniform, total=  28.9s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=uniform ..........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=uniform, total=  28.3s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=uniform ..........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=uniform, total=  28.0s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=uniform ..........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=uniform, total=  29.1s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=distance .........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=distance, total=  26.9s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=distance .........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=distance, total=  26.9s
[CV] algorithm=ball_tree, leaf_size=1, p=1, weights=distance .........
[CV]  algorithm=ball_tree, leaf_size=1, p=1, weights=distance, total=  27.5s
[CV] algorithm=ball_tree, leaf_size=1, 

[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed: 233.4min finished


{'algorithm': 'ball_tree', 'leaf_size': 1, 'p': 1, 'weights': 'distance'}
0.7316185012160491


# Evaluate model

# Third run

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
pred_train = knn_grid.predict(X_train)
pred_test = knn_grid.predict(X_test)
#print(accuracy_score(y_test, y_pred))
print('Train accuracy:', knn_grid.score(X_train, y_train))
print('Test accuracy:', knn_grid.score(X_test, y_test))
print('Train ROC_AUC:', roc_auc_score(y_train, knn_grid.predict_proba(X_train), average='macro', multi_class='ovr'))
print('Test ROC_AUC:', roc_auc_score(y_test, knn_grid.predict_proba(X_test), average='macro', multi_class='ovr'))

Train accuracy: 1.0
Test accuracy: 0.7416120888368519
Train ROC_AUC: 1.0
Test ROC_AUC: 0.9461147509719237


In [8]:
from sklearn.multiclass import OneVsRestClassifier
knn = neighbors.KNeighborsClassifier(algorithm='ball_tree', leaf_size=1, p=1, weights='distance')
knn_clf = OneVsRestClassifier(knn)
knn_clf.fit(X_train, y_train)
y_pred_train = knn_clf.predict(X_train)
y_pred_test = knn_clf.predict(X_test)

In [9]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      6750
           2       1.00      1.00      1.00      6755
           3       1.00      1.00      1.00      6736
           4       1.00      1.00      1.00      6722
           5       1.00      1.00      1.00      6749
           6       1.00      1.00      1.00      6712
           7       1.00      1.00      1.00      6721
           8       1.00      1.00      1.00      6724
           9       1.00      1.00      1.00      4213
          10       1.00      1.00      1.00      3042
          12       1.00      1.00      1.00     48198

    accuracy                           1.00    109322
   macro avg       1.00      1.00      1.00    109322
weighted avg       1.00      1.00      1.00    109322

              precision    recall  f1-score   support

           1       0.78      0.77      0.77      1650
           2       0.76      0.77      0.77      1645
           3       0.77 

In [10]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred_test))

[[1268   62   10    1    1    0    0    0    0   17  291]
 [  47 1274   47    1    0    0    0    1    0    4  271]
 [  10   37 1261    4    0    8    2    2    1    2  337]
 [   0    0    0 1281  120    6   10    5    1    0  255]
 [   0    0    0  112 1192   93   17    2    8    0  227]
 [   3    0    6   23   71 1261   35   11   20    0  258]
 [   0    0    0   18   27   44  986   13  132    2  457]
 [   0    0    1    7    4    7   15 1270   11    0  361]
 [   0    0    1   12   30   64  144    6  539    1  270]
 [  40   19    9    0    0    0    2    1    1  485  194]
 [ 262  284  305  364  324  261  283  409  144   94 9452]]
