In [1]:
from utils import * 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import itertools
from pprint import pprint
import joblib

import statistics

# Models
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score, RandomizedSearchCV 
from sklearn.model_selection import cross_val_predict

from sklearn.pipeline import Pipeline

from sklearn.metrics import make_scorer

# regression metrics
from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score

#classification metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, matthews_corrcoef

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.model_selection._split import check_cv

## Data 

In [2]:
# Label tables for the train and test splits; rows are keyed by the CASRN column.
train_labels = pd.read_csv(
    '../data/train_test_sets/train_labels.csv',
    index_col='CASRN',
)
test_labels = pd.read_csv(
    '../data/train_test_sets/test_labels.csv',
    index_col='CASRN',
)

# Show (rows, columns) of both tables as a quick sanity check.
(train_labels.shape, test_labels.shape)

((8221, 6), (2849, 6))

In [4]:
# Combined H-model feature table for the training split, keyed by the CASRN column.
train_Hfeatures = pd.read_csv(
    '../data/Hmodel_features_combined/train_Hfeatures.csv',
    index_col='CASRN',
)

# Display (rows, columns) of the loaded feature table.
train_Hfeatures.shape

(8221, 100)

In [5]:
# Peek at the first row only to inspect the feature column names.
train_Hfeatures.head(n=1)

Unnamed: 0_level_0,EPA_RF_ecfp6bits-1,EPA_RF_ecfp6bits-2,EPA_RF_ecfp6bits-3,EPA_RF_ecfp6counts-1,EPA_RF_ecfp6counts-2,EPA_RF_ecfp6counts-3,EPA_RF_maccs-1,EPA_RF_maccs-2,EPA_RF_maccs-3,EPA_RF_mordred-1,...,Toxic_svm_ecfp6bits-1,Toxic_svm_ecfp6counts-1,Toxic_svm_maccs-1,Toxic_svm_mordred-1,Toxic_svm_rdkit2d-1,Toxic_xgboost_ecfp6bits-1,Toxic_xgboost_ecfp6counts-1,Toxic_xgboost_maccs-1,Toxic_xgboost_mordred-1,Toxic_xgboost_rdkit2d-1
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.133902,0.240154,0.502604,0.125821,0.246357,0.522198,0.104243,0.165346,0.515884,0.081167,...,0.582366,0.682633,0.275429,0.816182,0.489093,0.668234,0.759473,0.36906,0.470704,0.472904


## Model Selection

We will model 3 endpoints with 4 algorithms (kNN, XGBoost, SVM and random forest).

In [6]:
# regression 
knn_reg = KNeighborsRegressor()
svr = SVR()
xgb_reg = XGBRegressor(random_state =123, n_jobs=6 ,objective ='reg:squarederror')
rf_reg =  RandomForestRegressor(random_state =123, n_jobs=6)


# classification 
svc = SVC(random_state =42)
knn_clf = KNeighborsClassifier()
xgb_clf = XGBClassifier(random_state =123, n_jobs=6)
rf_clf =  RandomForestClassifier(random_state =123, n_jobs=6)

### Search Space

In [16]:
# Search space shared by the kNN classifier and regressor:
# neighbourhood sizes from 5 to 151, distance-weighted votes only,
# and Minkowski exponent p = 1 or p = 2.
knn_grid_parameters = {
    'n_neighbors': [5, 9, 15, 19, 25, 35, 45, 55, 71, 101, 151],
    'weights': ['distance'],
    'p': [1, 2],
}

In [8]:
# Search space shared by SVC and SVR, given as two sub-grids so that
# `gamma` is only varied together with the RBF kernel.
svm_grid_parameters = [
    # Linear kernel: only the regularisation strength C is tuned.
    dict(
        C=[0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        kernel=['linear'],
    ),
    # RBF kernel: C is tuned jointly with the kernel width gamma.
    dict(
        C=[0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        gamma=[100, 10, 1, 0.1, 0.01, 0.001],
        kernel=['rbf'],
    ),
]

In [9]:
# Search space for the random-forest classifier and regressor.
# The search cells pass it with GridSearch = False and n_iter=30.

# Forest sizes: the two end points of the 500..1500 range.
n_estimators = list(map(int, np.linspace(start=500, stop=1500, num=2)))

# Rule for how many features are considered at each split.
max_features = ['log2', 'sqrt']

# Depth limits: six evenly spaced values from 5 to 80, plus None
# (no explicit depth limit).
max_depth = list(map(int, np.linspace(5, 80, num=6))) + [None]

# Smallest number of samples a node must hold before it may be split.
min_samples_split = [2, 5, 10]

# Smallest number of samples allowed in a leaf.
min_samples_leaf = [2, 4, 6]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the candidate values into the dict handed to the search.
rf_grid_parameters = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

pprint(rf_grid_parameters)

{'bootstrap': [True, False],
 'max_depth': [5, 20, 35, 50, 65, 80, None],
 'max_features': ['log2', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [500, 1500]}


In [10]:
# Search space for the XGBoost classifier and regressor.
# The search cells pass it with GridSearch = False and n_iter=30.
xgb_grid_parameters = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6, 10],
    min_child_weight=[1, 3, 5],
    gamma=[0, 1, 5],
    # Row subsampling fraction per tree: 0.6 to 1.0 in steps of 0.1.
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    # Column subsampling fraction per tree: 0.5 to 1.0 in steps of 0.1.
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    n_estimators=[500, 1500],
)

### Endpoint 1: Toxic

In [12]:
# Load the previously saved label encoder for the 'toxic' endpoint.
# NOTE(review): joblib.load unpickles the file, so only load artefacts produced by this project.
encoder_toxic = joblib.load(
    '../data/label_encoders/encoder_toxic.joblib'
)

In [17]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'toxic', encoder = encoder_toxic)

H_t_knn = model_selection(knn_clf, knn_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 151, 'p': 1, 'weights': 'distance'}
Best score: 0.8798414771964566
Grid scores on development set:

0.843 (+/-0.042) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.845 (+/-0.038) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.860 (+/-0.043) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.862 (+/-0.043) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.870 (+/-0.046) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.869 (+/-0.042) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.871 (+/-0.047) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.872 (+/-0.042) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.874 (+/-0.046) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.874 (+/-0.044) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.876 (+/-0.047) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.876 (+/-0.046) for {'n_neighbors': 35, 'p': 2, '

In [18]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'toxic', encoder = encoder_toxic)

H_t_svm = model_selection(svc, svm_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)



Best parameters set found on development set: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Best score: 0.8826931493143051
Grid scores on development set:

0.883 (+/-0.049) for {'C': 0.01, 'kernel': 'linear'}
0.880 (+/-0.049) for {'C': 0.1, 'kernel': 'linear'}
0.877 (+/-0.050) for {'C': 1, 'kernel': 'linear'}
0.876 (+/-0.050) for {'C': 10, 'kernel': 'linear'}
0.876 (+/-0.050) for {'C': 100, 'kernel': 'linear'}
0.876 (+/-0.050) for {'C': 200, 'kernel': 'linear'}
0.876 (+/-0.050) for {'C': 400, 'kernel': 'linear'}
0.876 (+/-0.050) for {'C': 1000, 'kernel': 'linear'}
0.510 (+/-0.007) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.764 (+/-0.028) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.828 (+/-0.042) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.864 (+/-0.049) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.865 (+/-0.052) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.853 (+/-0.055) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.511 (+/-0.007) for {'C': 0.1, 'gamma': 100, 'k

In [19]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'toxic', encoder = encoder_toxic)

H_t_rf = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
Best score: 0.8824387196376858
Grid scores on development set:

0.882 (+/-0.047) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.882 (+/-0.047) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
0.882 (+/-0.047) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
0.882 (+/-0.049) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.882 (+/-0.047) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
0.88

In [20]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'toxic', encoder = encoder_toxic)

H_t_xgb = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: 0.8833070866375617
Grid scores on development set:

0.880 (+/-0.045) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.867 (+/-0.044) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
0.870 (+/-0.042) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}
0.881 (+/-0.048) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.883 (+/-0.048) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning

### Endpoint 2: logLD50

In [21]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'logLD50_mmolkg', encoder = None)

H_ld50_rf = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
Best score: -0.3045348143883177
Grid scores on development set:

-0.305 (+/-0.056) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
-0.306 (+/-0.056) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': True}
-0.305 (+/-0.057) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
-0.305 (+/-0.057) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
-0.316 (+/-0.054) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': Tr

In [22]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'logLD50_mmolkg', encoder = None)

H_ld50_xgb = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
Best score: -0.30420437070099515
Grid scores on development set:

-0.325 (+/-0.059) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}
-0.305 (+/-0.060) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
-0.321 (+/-0.062) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.7}
-0.312 (+/-0.062) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.9}
-0.309 (+/-0.060) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, '

In [23]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'logLD50_mmolkg', encoder = None)

H_ld50_svr = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: -0.3047374781566766
Grid scores on development set:

-0.309 (+/-0.052) for {'C': 0.01, 'kernel': 'linear'}
-0.309 (+/-0.054) for {'C': 0.1, 'kernel': 'linear'}
-0.311 (+/-0.053) for {'C': 1, 'kernel': 'linear'}
-0.311 (+/-0.053) for {'C': 10, 'kernel': 'linear'}
-0.312 (+/-0.053) for {'C': 100, 'kernel': 'linear'}
-0.312 (+/-0.053) for {'C': 200, 'kernel': 'linear'}
-0.312 (+/-0.053) for {'C': 400, 'kernel': 'linear'}
-0.312 (+/-0.053) for {'C': 1000, 'kernel': 'linear'}
-0.844 (+/-0.335) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
-0.844 (+/-0.335) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
-0.766 (+/-0.319) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
-0.382 (+/-0.111) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
-0.343 (+/-0.066) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
-0.468 (+/-0.125) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
-0.844 (+/-0.334) for {'C': 0.1, 'g

In [24]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'logLD50_mmolkg', encoder = None)

H_ld50_knn = model_selection(knn_reg, knn_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 45, 'p': 2, 'weights': 'distance'}
Best score: -0.31540912162767376
Grid scores on development set:

-0.351 (+/-0.052) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-0.352 (+/-0.053) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
-0.332 (+/-0.055) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
-0.331 (+/-0.057) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
-0.323 (+/-0.051) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-0.322 (+/-0.049) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
-0.320 (+/-0.048) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
-0.319 (+/-0.047) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
-0.319 (+/-0.047) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-0.317 (+/-0.047) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
-0.319 (+/-0.048) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
-0.316 (+/-0.048) for {'n_neighbors': 

### Endpoint 3: EPA Category

In [25]:
# Load the previously saved label encoder for the 'EPA_category' endpoint.
# NOTE(review): joblib.load unpickles the file, so only load artefacts produced by this project.
encoder_epa = joblib.load(
    '../data/label_encoders/encoder_epa.joblib'
)

In [29]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'EPA_category', encoder = encoder_epa)

H_epa_rf = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.6416518706215465
Grid scores on development set:

0.630 (+/-0.073) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
0.632 (+/-0.071) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.638 (+/-0.056) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
0.630 (+/-0.069) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.639 (+/-0.062) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}


In [26]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'EPA_category', encoder = encoder_epa)

H_epa_xgb = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
Best score: 0.6402031489213905
Grid scores on development set:

0.640 (+/-0.061) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.637 (+/-0.070) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
0.636 (+/-0.060) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.635 (+/-0.065) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
0.633 (+/-0.066) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'lea

In [27]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'EPA_category', encoder = encoder_epa)

H_epa_svc = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'C': 0.1, 'kernel': 'linear'}
Best score: 0.6366080337823246
Grid scores on development set:

0.632 (+/-0.072) for {'C': 0.01, 'kernel': 'linear'}
0.637 (+/-0.067) for {'C': 0.1, 'kernel': 'linear'}
0.630 (+/-0.068) for {'C': 1, 'kernel': 'linear'}
0.630 (+/-0.071) for {'C': 10, 'kernel': 'linear'}
0.631 (+/-0.071) for {'C': 100, 'kernel': 'linear'}
0.631 (+/-0.071) for {'C': 200, 'kernel': 'linear'}
0.630 (+/-0.072) for {'C': 400, 'kernel': 'linear'}
0.629 (+/-0.070) for {'C': 1000, 'kernel': 'linear'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.531 (+/-0.061) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.425 (+/-0.018) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 100, 'kernel': 'rbf

In [28]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_Hfeatures, target = 'EPA_category', encoder = encoder_epa)

H_epa_knn = model_selection(knn_clf, knn_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
Best score: 0.6316034266269072
Grid scores on development set:

0.593 (+/-0.056) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.591 (+/-0.041) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.610 (+/-0.049) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.615 (+/-0.046) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.623 (+/-0.054) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.623 (+/-0.059) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.625 (+/-0.057) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.623 (+/-0.055) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.627 (+/-0.066) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.625 (+/-0.059) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.632 (+/-0.058) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.626 (+/-0.063) for {'n_neighbors': 35, 'p': 2, 'w