## Импорт библиотек

Загружаем все необходимые библиотеки для обработки данных, визуализации, построения моделей и оценки качества. Включены библиотеки `pandas`, `numpy`, `matplotlib`, а также модели из `scikit-learn` и `xgboost`.


In [None]:
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import itertools
from pprint import pprint
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score, RandomizedSearchCV 
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.model_selection._split import check_cv
from utils import *

## Загрузка целевых меток

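В этой ячейке загружаем файл с целевыми переменными обучающей выборки, а также пять наборов признаков для неё (ECFP6 bits, ECFP6 counts, MACCS, RDKit 2D, Mordred). Во всех таблицах индексом служит идентификатор `CASRN`.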


In [None]:
# Target table for the training set, indexed by the CASRN column.
train_labels = pd.read_csv(
    '../data/processed/train_labels.csv',
    index_col='CASRN',
)

# Training feature tables (one CSV per feature set), all indexed by CASRN.
train_ecfp6_bits = pd.read_csv(
    '../data/Bmodel_features/modeling_train_ecfp6_bits.csv',
    index_col='CASRN',
)
train_ecfp6_counts = pd.read_csv(
    '../data/Bmodel_features/modeling_train_ecfp6_counts.csv',
    index_col='CASRN',
)
train_maccs = pd.read_csv(
    '../data/Bmodel_features/modeling_train_maccs.csv',
    index_col='CASRN',
)
train_rdkit2d = pd.read_csv(
    '../data/Bmodel_features/modeling_train_rdkit2d.csv',
    index_col='CASRN',
)
train_mordred = pd.read_csv(
    '../data/Bmodel_features/modeling_train_mordred.csv',
    index_col='CASRN',
)

Создаём экземпляры моделей

In [None]:
# Regression estimators. The tree ensembles get a fixed seed and 6 worker threads.
knn_reg = KNeighborsRegressor()
svr = SVR()
xgb_reg = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=6,
    random_state=123,
)
rf_reg = RandomForestRegressor(n_jobs=6, random_state=123)

# Classification estimators. Note that SVC is seeded with 42 while the
# ensembles use 123.
svc = SVC(random_state=42)
knn_clf = KNeighborsClassifier()
xgb_clf = XGBClassifier(n_jobs=6, random_state=123)
rf_clf = RandomForestClassifier(n_jobs=6, random_state=123)

Задаём сетки значений гиперпараметров, по которым будет вестись поиск для каждой модели

In [7]:
# kNN search spaces. All three use distance-weighted voting and differ in the
# distance definition that is searched.

# Used below with the rdkit2d / mordred tables: Minkowski power p in {1, 2}.
knn_grid_parameters_des = {
    'n_neighbors': [5, 9, 15, 19, 25, 35, 45, 55, 71],
    'weights': ['distance'],
    'p': [1, 2],
}

# Used below with the ecfp6_bits / maccs tables. This grid additionally
# contains n_neighbors=23, which the other two grids do not.
knn_grid_parameters_fp = {
    'n_neighbors': [5, 9, 15, 19, 23, 25, 35, 45, 55, 71],
    'weights': ['distance'],
    'metric': ['jaccard', 'dice', 'rogerstanimoto'],
}

# Used below with the ecfp6_counts table.
knn_grid_parameters_fpcounts = {
    'n_neighbors': [5, 9, 15, 19, 25, 35, 45, 55, 71],
    'weights': ['distance'],
    'metric': ['hamming', 'canberra', 'braycurtis'],
}


In [8]:
# Wide SVM search space: a linear-kernel sub-grid and an RBF sub-grid.
# NOTE: the name svm_grid_parameters is rebound to a smaller dict by a later
# cell of this notebook, so when cells run top to bottom this wide grid is
# not the one the SVM searches below receive.
svm_grid_parameters = [
    {
        'C': [0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        'kernel': ['linear'],
    },
    {
        'C': [0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        'gamma': [100, 10, 1, 0.1, 0.01, 0.001],
        'kernel': ['rbf'],
    },
]


In [9]:
# Reduced SVM search space (4 gamma values x 3 C values). It has no 'kernel'
# entry, so the estimator's own kernel setting applies. This assignment
# replaces any earlier value bound to svm_grid_parameters.
svm_grid_parameters = {
    'gamma': [10, 1, 0.1, 0.01],
    'C': [1, 10, 100],
}

In [10]:
# Smallest SVM search space: 2 gamma values x 2 C values, no 'kernel' entry.
svm_grid_parameters_1 = {
    'gamma': [1, 0.1],
    'C': [1, 10],
}


In [11]:
# Small two-part SVM search space: a linear-kernel sub-grid (C only) and an
# RBF sub-grid (C and gamma).
svm_grid_parameters_2 = [
    {
        'C': [1, 10],
        'kernel': ['linear'],
    },
    {
        'C': [1, 10],
        'gamma': [1, 0.1],
        'kernel': ['rbf'],
    },
]

In [None]:
# Random-forest search space. The individual value lists are kept as
# module-level names and then collected into rf_grid_parameters.

# Two forest sizes, the end points of the 500..1500 range.
n_estimators = list(map(int, np.linspace(start=500, stop=1500, num=2)))
max_features = ['log2', 'sqrt']
# Six evenly spaced depth limits from 5 to 80, plus None as a final entry.
max_depth = list(map(int, np.linspace(5, 80, num=6))) + [None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [2, 4, 6]
bootstrap = [True, False]

rf_grid_parameters = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# Echo the assembled grid so it is recorded in the notebook output.
pprint(rf_grid_parameters)

{'bootstrap': [True, False],
 'max_depth': [5, 20, 35, 50, 65, 80, None],
 'max_features': ['log2', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [500, 1500]}


In [14]:
# XGBoost search space.
xgb_grid_parameters = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6, 10],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 1, 5],
    # 0.6, 0.7, ..., 1.0
    'subsample': [tenth / 10.0 for tenth in range(6, 11)],
    # 0.5, 0.6, ..., 1.0
    'colsample_bytree': [tenth / 10.0 for tenth in range(5, 11)],
    'n_estimators': [500, 1500],
}

Напишем функцию для структурированного вывода полученных результатов

In [None]:
def result_model_selection(results, name):
    """Summarise a finished hyperparameter search as a DataFrame.

    Parameters
    ----------
    results : object
        Any object exposing a ``cv_results_`` mapping with the keys
        ``'params'``, ``'mean_test_score'``, ``'std_test_score'`` and
        ``'rank_test_score'``.
    name : str
        Label written into the ``'model'`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cv_results_['params']`` with the columns
        ``'model'``, ``'params'``, ``'mean score'``, ``'std score'`` and
        ``'rank'``, in that order.
    """
    cv = results.cv_results_
    candidates = cv['params']
    summary = {
        # Repeat the label so that every candidate row carries it.
        'model': [name for _ in candidates],
        'params': candidates,
        'mean score': cv['mean_test_score'],
        'std score': cv['std_test_score'],
        'rank': cv['rank_test_score'],
    }
    return pd.DataFrame(summary)

## Проводим анализ

kNN
Endpoint 1: VeryToxic

In [59]:
# Load the stored encoder object that the cells below pass as `encoder=` to
# prepare_input for the 'verytoxic' target (presumably a fitted label encoder).
# NOTE(review): joblib.load unpickles its input; only load trusted files.
encoder_verytoxic = joblib.load('../encoder_models/encoder_verytoxic.joblib')


In [60]:
%%time
# kNN classifier, target 'verytoxic', features train_ecfp6_bits, grid
# knn_grid_parameters_fp, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='verytoxic',
    encoder=encoder_verytoxic,
)

vt_knn_ecfp6bits = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 25, 'weights': 'distance'}
Best score: 0.8333423642625253
Grid scores on development set:

0.794 (+/-0.106) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.814 (+/-0.107) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.826 (+/-0.113) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.829 (+/-0.113) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.832 (+/-0.119) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.832 (+/-0.119) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.828 (+/-0.126) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.828 (+/-0.126) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.828 (+/-0.125) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.824 (+/-0.134) for {'metric': 'jaccard', 'n_neighbors': 71, 'wei

In [61]:
%%time
# kNN classifier, target 'verytoxic', features train_maccs, grid
# knn_grid_parameters_fp, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='verytoxic',
    encoder=encoder_verytoxic,
)

vt_knn_maccs = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 35, 'weights': 'distance'}
Best score: 0.8353930162092229
Grid scores on development set:

0.799 (+/-0.120) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.819 (+/-0.106) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.829 (+/-0.108) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.832 (+/-0.104) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.834 (+/-0.102) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.833 (+/-0.103) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.834 (+/-0.097) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.830 (+/-0.105) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.830 (+/-0.105) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.825 (+/-0.110) for {'metric': 'jaccard', 'n_neighbors': 71, 'wei

In [62]:
%%time
# kNN classifier, target 'verytoxic', features train_ecfp6_counts, grid
# knn_grid_parameters_fpcounts, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='verytoxic',
    encoder=encoder_verytoxic,
)

vt_knn_ecfp6count = model_selection(
    knn_clf,
    knn_grid_parameters_fpcounts,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 19, 'weights': 'distance'}
Best score: 0.8283099373483698
Grid scores on development set:

0.742 (+/-0.120) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.762 (+/-0.126) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.769 (+/-0.122) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.774 (+/-0.121) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.775 (+/-0.121) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.781 (+/-0.113) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.783 (+/-0.125) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.779 (+/-0.130) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.780 (+/-0.132) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.749 (+/-0.116) for {'metric': 'canberra', 'n_neighbors': 5

In [63]:
%%time
# kNN classifier, target 'verytoxic', features train_rdkit2d, grid
# knn_grid_parameters_des, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='verytoxic',
    encoder=encoder_verytoxic,
)

vt_knn_rdkit2d = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
Best score: 0.8246999522990495
Grid scores on development set:

0.788 (+/-0.110) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.773 (+/-0.109) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.808 (+/-0.116) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.789 (+/-0.115) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.820 (+/-0.121) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.799 (+/-0.132) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.115) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.809 (+/-0.132) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.124) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.810 (+/-0.131) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.825 (+/-0.122) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.808 (+/-0.134) for {'n_neighbors': 35, 'p': 2, 'w

In [64]:
%%time
# kNN classifier, target 'verytoxic', features train_mordred, grid
# knn_grid_parameters_des, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='verytoxic',
    encoder=encoder_verytoxic,
)

vt_knn_mordred = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'n_neighbors': 45, 'p': 1, 'weights': 'distance'}
Best score: 0.8251542029500607
Grid scores on development set:

0.809 (+/-0.107) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.797 (+/-0.115) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.107) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.808 (+/-0.120) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.823 (+/-0.125) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.821 (+/-0.123) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.825 (+/-0.127) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.822 (+/-0.124) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.825 (+/-0.125) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.819 (+/-0.126) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.127) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.818 (+/-0.129) for {'n_neighbors': 35, 'p': 2, 'w

Endpoint 2: toxic

In [65]:
# Load the stored encoder object that the cells below pass as `encoder=` to
# prepare_input for the 'toxic' target (presumably a fitted label encoder).
# NOTE(review): joblib.load unpickles its input; only load trusted files.
encoder_toxic = joblib.load('../encoder_models/encoder_toxic.joblib')

In [66]:
%%time
# kNN classifier, target 'toxic', features train_ecfp6_bits, grid
# knn_grid_parameters_fp, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='toxic',
    encoder=encoder_toxic,
)

t_knn_ecfp6bits = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 19, 'weights': 'distance'}
Best score: 0.7765003505028532
Grid scores on development set:

0.761 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.770 (+/-0.056) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.774 (+/-0.051) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.775 (+/-0.054) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.773 (+/-0.059) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.772 (+/-0.061) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.767 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.764 (+/-0.067) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.761 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.756 (+/-0.064) for {'metric': 'jaccard', 'n_neighbors': 71, 'wei

In [67]:
%%time
# kNN classifier, target 'toxic', features train_maccs, grid
# knn_grid_parameters_fp, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='toxic',
    encoder=encoder_toxic,
)

t_knn_maccs = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 19, 'weights': 'distance'}
Best score: 0.8005131593622032
Grid scores on development set:

0.790 (+/-0.083) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.798 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.798 (+/-0.085) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.797 (+/-0.087) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.796 (+/-0.085) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.796 (+/-0.085) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.792 (+/-0.085) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.790 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.787 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.784 (+/-0.078) for {'metric': 'jaccard', 'n_neighbors'

In [68]:
%%time
# kNN classifier, target 'toxic', features train_ecfp6_counts, grid
# knn_grid_parameters_fpcounts, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='toxic',
    encoder=encoder_toxic,
)

t_knn_ecfp6count = model_selection(
    knn_clf,
    knn_grid_parameters_fpcounts,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 15, 'weights': 'distance'}
Best score: 0.7758825346243101
Grid scores on development set:

0.728 (+/-0.062) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.730 (+/-0.062) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.733 (+/-0.049) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.735 (+/-0.045) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.731 (+/-0.041) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.725 (+/-0.041) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.728 (+/-0.039) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.724 (+/-0.040) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.718 (+/-0.038) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.729 (+/-0.061) for {'metric': 'canberra', 'n_neighbors': 5

In [69]:
%%time
# kNN classifier, target 'toxic', features train_rdkit2d, grid
# knn_grid_parameters_des, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='toxic',
    encoder=encoder_toxic,
)

t_knn_rdkit2d = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
Best score: 0.8032485798583557
Grid scores on development set:

0.794 (+/-0.075) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.778 (+/-0.077) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.802 (+/-0.074) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.782 (+/-0.081) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.803 (+/-0.068) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.781 (+/-0.076) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.801 (+/-0.065) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.778 (+/-0.074) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.798 (+/-0.065) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.776 (+/-0.067) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.792 (+/-0.067) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.771 (+/-0.062) for {'n_neighbors': 35, 'p': 2, 'w

In [70]:
%%time
# kNN classifier, target 'toxic', features train_mordred, grid
# knn_grid_parameters_des, scoring 'roc_auc', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='toxic',
    encoder=encoder_toxic,
)

t_knn_mordred = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
Best score: 0.8050440160843321
Grid scores on development set:

0.794 (+/-0.074) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.783 (+/-0.076) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.803 (+/-0.072) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.792 (+/-0.083) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.805 (+/-0.070) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.789 (+/-0.081) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.802 (+/-0.074) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.788 (+/-0.079) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.799 (+/-0.071) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.788 (+/-0.074) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.797 (+/-0.068) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.784 (+/-0.073) for {'n_neighbors': 35, 'p': 2, 'w

Endpoint 3: EPA

In [71]:
# Load the stored encoder object that the cells below pass as `encoder=` to
# prepare_input for the 'EPA_category' target (presumably a fitted label encoder).
# NOTE(review): joblib.load unpickles its input; only load trusted files.
encoder_epa = joblib.load('../encoder_models/encoder_epa.joblib')


In [72]:
%%time
# kNN classifier, target 'EPA_category', features train_ecfp6_bits, grid
# knn_grid_parameters_fp, scoring 'f1_weighted', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_knn_ecfp6bits = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)



Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 9, 'weights': 'distance'}
Best score: 0.5280720565764737
Grid scores on development set:

0.525 (+/-0.028) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.527 (+/-0.032) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.524 (+/-0.045) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.514 (+/-0.058) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.511 (+/-0.058) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.507 (+/-0.063) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.495 (+/-0.067) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.481 (+/-0.061) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.471 (+/-0.059) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.457 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 71, 'weig

In [73]:
%%time
# kNN classifier, target 'EPA_category', features train_maccs, grid
# knn_grid_parameters_fp, scoring 'f1_weighted', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_knn_maccs = model_selection(
    knn_clf,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 15, 'weights': 'distance'}
Best score: 0.5400022074386337
Grid scores on development set:

0.531 (+/-0.049) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.535 (+/-0.052) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.533 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.533 (+/-0.072) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.530 (+/-0.070) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.531 (+/-0.070) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.521 (+/-0.080) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.515 (+/-0.084) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.509 (+/-0.070) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.500 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors'

In [74]:
%%time
# kNN classifier, target 'EPA_category', features train_ecfp6_counts, grid
# knn_grid_parameters_fpcounts, scoring 'f1_weighted', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_knn_ecfp6count = model_selection(
    knn_clf,
    knn_grid_parameters_fpcounts,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 9, 'weights': 'distance'}
Best score: 0.5286790743951763
Grid scores on development set:

0.489 (+/-0.024) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.492 (+/-0.034) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.479 (+/-0.030) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.472 (+/-0.026) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.466 (+/-0.041) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.448 (+/-0.033) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.433 (+/-0.042) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.418 (+/-0.042) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.407 (+/-0.041) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.489 (+/-0.031) for {'metric': 'canberra', 'n_neighbors': 5,

In [75]:
%%time
# kNN classifier, target 'EPA_category', features train_rdkit2d, grid
# knn_grid_parameters_des, scoring 'f1_weighted', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_knn_rdkit2d = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.5394396679306864
Grid scores on development set:

0.535 (+/-0.044) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.518 (+/-0.039) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.539 (+/-0.057) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.519 (+/-0.048) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.532 (+/-0.067) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.515 (+/-0.060) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.527 (+/-0.072) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.507 (+/-0.063) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.520 (+/-0.075) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.500 (+/-0.071) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.509 (+/-0.081) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.492 (+/-0.077) for {'n_neighbors': 35, 'p': 2, 'we

In [76]:
%%time
# kNN classifier, target 'EPA_category', features train_mordred, grid
# knn_grid_parameters_des, scoring 'f1_weighted', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_knn_mordred = model_selection(
    knn_clf,
    knn_grid_parameters_des,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.5397530479270659
Grid scores on development set:

0.535 (+/-0.043) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.529 (+/-0.050) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.540 (+/-0.038) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.531 (+/-0.054) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.535 (+/-0.055) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.528 (+/-0.062) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.530 (+/-0.060) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.520 (+/-0.057) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.524 (+/-0.069) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.512 (+/-0.062) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.512 (+/-0.062) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.499 (+/-0.064) for {'n_neighbors': 35, 'p': 2, 'we

Endpoint 4: logLD50

In [77]:
%%time
# kNN regressor, target 'logLD50_mmolkg' (no encoder), features
# train_ecfp6_bits, grid knn_grid_parameters_fp,
# scoring 'neg_mean_squared_error', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_knn_ecfp6bits = model_selection(
    knn_reg,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 15, 'weights': 'distance'}
Best score: -0.5332636774453705
Grid scores on development set:

-0.554 (+/-0.148) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
-0.543 (+/-0.169) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
-0.538 (+/-0.175) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
-0.541 (+/-0.173) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
-0.546 (+/-0.171) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
-0.548 (+/-0.171) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
-0.559 (+/-0.168) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
-0.566 (+/-0.167) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
-0.575 (+/-0.172) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
-0.589 (+/-0.182) for {'metric': 'jaccard', 'n_neighbors

In [78]:
%%time
# kNN regressor, target 'logLD50_mmolkg' (no encoder), features train_maccs,
# grid knn_grid_parameters_fp, scoring 'neg_mean_squared_error', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_knn_maccs = model_selection(
    knn_reg,
    knn_grid_parameters_fp,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)



Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 9, 'weights': 'distance'}
Best score: -0.489269050087817
Grid scores on development set:

-0.504 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
-0.500 (+/-0.106) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
-0.503 (+/-0.134) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
-0.509 (+/-0.143) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
-0.516 (+/-0.147) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
-0.519 (+/-0.148) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
-0.532 (+/-0.152) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
-0.543 (+/-0.155) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
-0.552 (+/-0.156) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
-0.561 (+/-0.157) for {'metric': 'jaccard', 'n_n

In [79]:
%%time
# kNN regressor, target 'logLD50_mmolkg' (no encoder), features
# train_ecfp6_counts, grid knn_grid_parameters_fpcounts,
# scoring 'neg_mean_squared_error', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_knn_ecfp6count = model_selection(
    knn_reg,
    knn_grid_parameters_fpcounts,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 9, 'weights': 'distance'}
Best score: -0.5249314216982528
Grid scores on development set:

-0.682 (+/-0.228) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
-0.696 (+/-0.273) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
-0.719 (+/-0.295) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
-0.732 (+/-0.296) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
-0.749 (+/-0.309) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
-0.770 (+/-0.325) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
-0.787 (+/-0.340) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
-0.799 (+/-0.346) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
-0.817 (+/-0.355) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
-0.679 (+/-0.229) for {'metric': 'canberra', 'n_nei

In [80]:
%%time
# kNN regressor, target 'logLD50_mmolkg' (no encoder), features train_rdkit2d,
# grid knn_grid_parameters_des, scoring 'neg_mean_squared_error', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_knn_rdkit2d = model_selection(
    knn_reg,
    knn_grid_parameters_des,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: -0.49686610339448434
Grid scores on development set:

-0.509 (+/-0.124) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-0.550 (+/-0.160) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
-0.497 (+/-0.126) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
-0.538 (+/-0.151) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
-0.504 (+/-0.138) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-0.539 (+/-0.168) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
-0.511 (+/-0.140) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
-0.544 (+/-0.159) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
-0.520 (+/-0.148) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-0.554 (+/-0.164) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
-0.535 (+/-0.148) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
-0.565 (+/-0.164) for {'n_neighbors': 3

In [81]:
%%time
# kNN regressor, target 'logLD50_mmolkg' (no encoder), features train_mordred,
# grid knn_grid_parameters_des, scoring 'neg_mean_squared_error', cv=5.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_knn_mordred = model_selection(
    knn_reg,
    knn_grid_parameters_des,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)


Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: -0.47073453589854897
Grid scores on development set:

-0.475 (+/-0.083) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-0.495 (+/-0.092) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
-0.471 (+/-0.122) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
-0.491 (+/-0.111) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
-0.480 (+/-0.141) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-0.496 (+/-0.129) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
-0.487 (+/-0.139) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
-0.503 (+/-0.131) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
-0.495 (+/-0.136) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-0.508 (+/-0.130) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
-0.509 (+/-0.135) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
-0.520 (+/-0.128) for {'n_neighbors': 3

SVM
Endpoint 1: verytoxic
Endpoint 2: toxic

In [82]:
# Reload the encoder object passed as `encoder=` for the 'toxic' target in the
# SVM cells below. The same file is already loaded into encoder_toxic earlier
# in this notebook, so this is redundant when cells run in order.
# NOTE(review): joblib.load unpickles its input; only load trusted files.
encoder_toxic = joblib.load('../encoder_models/encoder_toxic.joblib')

In [83]:
%%time
# SVC, target 'toxic', features train_ecfp6_bits, grid svm_grid_parameters,
# scoring 'roc_auc', cv=5, n_jobs=6.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_ecfp6bits = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)


Best parameters set found on development set: {'C': 1, 'gamma': 0.01}
Best score: 0.7615940094038895
Grid scores on development set:

0.501 (+/-0.002) for {'C': 1, 'gamma': 10}
0.677 (+/-0.065) for {'C': 1, 'gamma': 1}
0.761 (+/-0.081) for {'C': 1, 'gamma': 0.1}
0.762 (+/-0.067) for {'C': 1, 'gamma': 0.01}
0.501 (+/-0.002) for {'C': 10, 'gamma': 10}
0.678 (+/-0.067) for {'C': 10, 'gamma': 1}
0.759 (+/-0.082) for {'C': 10, 'gamma': 0.1}
0.747 (+/-0.051) for {'C': 10, 'gamma': 0.01}
0.501 (+/-0.002) for {'C': 100, 'gamma': 10}
0.678 (+/-0.067) for {'C': 100, 'gamma': 1}
0.758 (+/-0.082) for {'C': 100, 'gamma': 0.1}
0.717 (+/-0.056) for {'C': 100, 'gamma': 0.01}
CPU times: total: 1min 30s
Wall time: 20min 53s


In [84]:
%%time
# SVC, target 'toxic', features train_maccs, grid svm_grid_parameters,
# scoring 'roc_auc', cv=5. This run passes n_jobs=1, whereas the ecfp6 runs
# pass n_jobs=6.
# prepare_input / model_selection are not defined in the visible part of this
# notebook (presumably from utils); only the 1st and 3rd returned values are used.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_maccs = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=1,
)


Best parameters set found on development set: {'C': 1, 'gamma': 0.1}
Best score: 0.8142373488655791
Grid scores on development set:

0.614 (+/-0.073) for {'C': 1, 'gamma': 10}
0.750 (+/-0.108) for {'C': 1, 'gamma': 1}
0.814 (+/-0.067) for {'C': 1, 'gamma': 0.1}
0.785 (+/-0.065) for {'C': 1, 'gamma': 0.01}
0.614 (+/-0.073) for {'C': 10, 'gamma': 10}
0.748 (+/-0.107) for {'C': 10, 'gamma': 1}
0.793 (+/-0.081) for {'C': 10, 'gamma': 0.1}
0.801 (+/-0.068) for {'C': 10, 'gamma': 0.01}
0.614 (+/-0.073) for {'C': 100, 'gamma': 10}
0.748 (+/-0.107) for {'C': 100, 'gamma': 1}
0.790 (+/-0.077) for {'C': 100, 'gamma': 0.1}
0.781 (+/-0.080) for {'C': 100, 'gamma': 0.01}
CPU times: total: 7min 51s
Wall time: 8min 4s


In [85]:
%%time
# Tune `svc` over svm_grid_parameters on train_ecfp6_counts for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=6).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'toxic', encoder = encoder_toxic)

t_svc_ecfp6counts = model_selection(svc, svm_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = True, n_jobs=6)


Best parameters set found on development set: {'C': 1, 'gamma': 0.01}
Best score: 0.756896105732593
Grid scores on development set:

0.500 (+/-0.001) for {'C': 1, 'gamma': 10}
0.662 (+/-0.042) for {'C': 1, 'gamma': 1}
0.753 (+/-0.070) for {'C': 1, 'gamma': 0.1}
0.757 (+/-0.060) for {'C': 1, 'gamma': 0.01}
0.500 (+/-0.001) for {'C': 10, 'gamma': 10}
0.662 (+/-0.043) for {'C': 10, 'gamma': 1}
0.752 (+/-0.073) for {'C': 10, 'gamma': 0.1}
0.753 (+/-0.049) for {'C': 10, 'gamma': 0.01}
0.500 (+/-0.001) for {'C': 100, 'gamma': 10}
0.662 (+/-0.043) for {'C': 100, 'gamma': 1}
0.752 (+/-0.072) for {'C': 100, 'gamma': 0.1}
0.729 (+/-0.041) for {'C': 100, 'gamma': 0.01}
CPU times: total: 1min 8s
Wall time: 16min 39s


In [86]:

%%time
# Tune `svc` over svm_grid_parameters on train_rdkit2d for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'toxic', encoder = encoder_toxic)

t_svc_rdkit2d = model_selection(svc, svm_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = True, n_jobs=1)


Best parameters set found on development set: {'C': 1, 'gamma': 1}
Best score: 0.8141723132585037
Grid scores on development set:

0.777 (+/-0.092) for {'C': 1, 'gamma': 10}
0.814 (+/-0.069) for {'C': 1, 'gamma': 1}
0.760 (+/-0.049) for {'C': 1, 'gamma': 0.1}
0.710 (+/-0.036) for {'C': 1, 'gamma': 0.01}
0.769 (+/-0.083) for {'C': 10, 'gamma': 10}
0.812 (+/-0.074) for {'C': 10, 'gamma': 1}
0.796 (+/-0.066) for {'C': 10, 'gamma': 0.1}
0.745 (+/-0.046) for {'C': 10, 'gamma': 0.01}
0.768 (+/-0.080) for {'C': 100, 'gamma': 10}
0.775 (+/-0.080) for {'C': 100, 'gamma': 1}
0.805 (+/-0.075) for {'C': 100, 'gamma': 0.1}
0.772 (+/-0.064) for {'C': 100, 'gamma': 0.01}
CPU times: total: 8min 3s
Wall time: 8min 19s


In [87]:
%%time
# Tune `svc` over svm_grid_parameters on train_mordred for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'toxic', encoder = encoder_toxic)

t_svc_mordred = model_selection(svc, svm_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = True, n_jobs=1)


Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: 0.8216012931247432
Grid scores on development set:

0.750 (+/-0.090) for {'C': 1, 'gamma': 10}
0.814 (+/-0.075) for {'C': 1, 'gamma': 1}
0.803 (+/-0.063) for {'C': 1, 'gamma': 0.1}
0.758 (+/-0.066) for {'C': 1, 'gamma': 0.01}
0.747 (+/-0.090) for {'C': 10, 'gamma': 10}
0.797 (+/-0.091) for {'C': 10, 'gamma': 1}
0.822 (+/-0.060) for {'C': 10, 'gamma': 0.1}
0.784 (+/-0.066) for {'C': 10, 'gamma': 0.01}
0.747 (+/-0.090) for {'C': 100, 'gamma': 10}
0.789 (+/-0.092) for {'C': 100, 'gamma': 1}
0.806 (+/-0.067) for {'C': 100, 'gamma': 0.1}
0.797 (+/-0.059) for {'C': 100, 'gamma': 0.01}
CPU times: total: 18min 44s
Wall time: 19min 21s


Endpoint 2: EPA

In [88]:
# Load the saved encoder for the 'EPA_category' target; it is passed to prepare_input(encoder=...) in the cells below.
# NOTE(review): joblib.load unpickles the file, so the .joblib file must come from a trusted source.
encoder_epa = joblib.load('../encoder_models/encoder_epa.joblib')

In [89]:
%%time
# Tune `svc` over svm_grid_parameters on train_ecfp6_bits for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=6).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'EPA_category', encoder = encoder_epa)

epa_svc_ecfp6bits = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True, n_jobs=6)


Best parameters set found on development set: {'C': 10, 'gamma': 0.01}
Best score: 0.502214881274199
Grid scores on development set:

0.335 (+/-0.002) for {'C': 1, 'gamma': 10}
0.335 (+/-0.002) for {'C': 1, 'gamma': 1}
0.385 (+/-0.032) for {'C': 1, 'gamma': 0.1}
0.473 (+/-0.071) for {'C': 1, 'gamma': 0.01}
0.335 (+/-0.002) for {'C': 10, 'gamma': 10}
0.334 (+/-0.002) for {'C': 10, 'gamma': 1}
0.415 (+/-0.041) for {'C': 10, 'gamma': 0.1}
0.502 (+/-0.046) for {'C': 10, 'gamma': 0.01}
0.335 (+/-0.002) for {'C': 100, 'gamma': 10}
0.334 (+/-0.002) for {'C': 100, 'gamma': 1}
0.415 (+/-0.041) for {'C': 100, 'gamma': 0.1}
0.473 (+/-0.048) for {'C': 100, 'gamma': 0.01}
CPU times: total: 2min 26s
Wall time: 32min 17s


In [90]:
%%time
# Tune `svc` over svm_grid_parameters on train_ecfp6_counts for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=6).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'EPA_category', encoder = encoder_epa)

epa_svc_ecfp6counts = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True, n_jobs=6)


Best parameters set found on development set: {'C': 10, 'gamma': 0.01}
Best score: 0.507425711140356
Grid scores on development set:

0.335 (+/-0.002) for {'C': 1, 'gamma': 10}
0.335 (+/-0.002) for {'C': 1, 'gamma': 1}
0.361 (+/-0.013) for {'C': 1, 'gamma': 0.1}
0.468 (+/-0.057) for {'C': 1, 'gamma': 0.01}
0.335 (+/-0.002) for {'C': 10, 'gamma': 10}
0.335 (+/-0.002) for {'C': 10, 'gamma': 1}
0.383 (+/-0.019) for {'C': 10, 'gamma': 0.1}
0.507 (+/-0.047) for {'C': 10, 'gamma': 0.01}
0.335 (+/-0.002) for {'C': 100, 'gamma': 10}
0.335 (+/-0.002) for {'C': 100, 'gamma': 1}
0.383 (+/-0.019) for {'C': 100, 'gamma': 0.1}
0.476 (+/-0.045) for {'C': 100, 'gamma': 0.01}
CPU times: total: 2min 2s
Wall time: 25min 19s


In [91]:
%%time
# Tune `svc` over svm_grid_parameters on train_maccs for target 'EPA_category' (scoring='f1_weighted', cv=5).
# n_jobs is not passed here, so model_selection's own default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'EPA_category', encoder = encoder_epa)

epa_svc_maccs = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)


Best parameters set found on development set: {'C': 100, 'gamma': 0.1}
Best score: 0.538085334634762
Grid scores on development set:

0.349 (+/-0.012) for {'C': 1, 'gamma': 10}
0.362 (+/-0.038) for {'C': 1, 'gamma': 1}
0.532 (+/-0.089) for {'C': 1, 'gamma': 0.1}
0.443 (+/-0.019) for {'C': 1, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 10, 'gamma': 10}
0.375 (+/-0.038) for {'C': 10, 'gamma': 1}
0.538 (+/-0.039) for {'C': 10, 'gamma': 0.1}
0.526 (+/-0.042) for {'C': 10, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 100, 'gamma': 10}
0.375 (+/-0.038) for {'C': 100, 'gamma': 1}
0.538 (+/-0.034) for {'C': 100, 'gamma': 0.1}
0.507 (+/-0.043) for {'C': 100, 'gamma': 0.01}
CPU times: total: 15.9 s
Wall time: 3min 35s


In [92]:
%%time
# Tune `svc` over svm_grid_parameters on train_rdkit2d for target 'EPA_category' (scoring='f1_weighted', cv=5).
# n_jobs is not passed here, so model_selection's own default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'EPA_category', encoder = encoder_epa)

epa_svc_rdkit2d = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)



Best parameters set found on development set: {'C': 10, 'gamma': 1}
Best score: 0.5396127137198012
Grid scores on development set:

0.450 (+/-0.090) for {'C': 1, 'gamma': 10}
0.501 (+/-0.079) for {'C': 1, 'gamma': 1}
0.364 (+/-0.027) for {'C': 1, 'gamma': 0.1}
0.335 (+/-0.002) for {'C': 1, 'gamma': 0.01}
0.480 (+/-0.086) for {'C': 10, 'gamma': 10}
0.540 (+/-0.056) for {'C': 10, 'gamma': 1}
0.469 (+/-0.058) for {'C': 10, 'gamma': 0.1}
0.352 (+/-0.016) for {'C': 10, 'gamma': 0.01}
0.481 (+/-0.084) for {'C': 100, 'gamma': 10}
0.514 (+/-0.056) for {'C': 100, 'gamma': 1}
0.523 (+/-0.055) for {'C': 100, 'gamma': 0.1}
0.435 (+/-0.033) for {'C': 100, 'gamma': 0.01}
CPU times: total: 14.9 s
Wall time: 3min 15s


In [93]:
%%time
# Tune `svc` over svm_grid_parameters on train_mordred for target 'EPA_category' (scoring='f1_weighted', cv=5).
# n_jobs is not passed here, so model_selection's own default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'EPA_category', encoder = encoder_epa)

epa_svc_mordred = model_selection(svc, svm_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)


Best parameters set found on development set: {'C': 10, 'gamma': 1}
Best score: 0.5365768484152484
Grid scores on development set:

0.348 (+/-0.019) for {'C': 1, 'gamma': 10}
0.523 (+/-0.085) for {'C': 1, 'gamma': 1}
0.450 (+/-0.078) for {'C': 1, 'gamma': 0.1}
0.338 (+/-0.005) for {'C': 1, 'gamma': 0.01}
0.365 (+/-0.039) for {'C': 10, 'gamma': 10}
0.537 (+/-0.056) for {'C': 10, 'gamma': 1}
0.535 (+/-0.056) for {'C': 10, 'gamma': 0.1}
0.426 (+/-0.073) for {'C': 10, 'gamma': 0.01}
0.365 (+/-0.039) for {'C': 100, 'gamma': 10}
0.532 (+/-0.053) for {'C': 100, 'gamma': 1}
0.534 (+/-0.050) for {'C': 100, 'gamma': 0.1}
0.487 (+/-0.073) for {'C': 100, 'gamma': 0.01}
CPU times: total: 48.7 s
Wall time: 7min 34s


In [94]:
# Post-process the SVC/Mordred EPA search result, labelled 'epa_svc_mordred'.
# NOTE(review): result_model_selection is defined outside this excerpt; the `df_` prefix suggests it
# returns a DataFrame — confirm. This is the only search result in this section that is post-processed.
df_epa_svc_mordred = result_model_selection(epa_svc_mordred, 'epa_svc_mordred')

Endpoint 3: logLD50

In [95]:
%%time
# Tune `svr` over svm_grid_parameters on train_ecfp6_bits for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=6). No encoder is passed for this target.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

ld50_svr_ecfp6bits = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True, n_jobs = 6)


Best parameters set found on development set: {'C': 1, 'gamma': 0.01}
Best score: -0.5503405736122804
Grid scores on development set:

-0.827 (+/-0.309) for {'C': 1, 'gamma': 10}
-0.827 (+/-0.309) for {'C': 1, 'gamma': 1}
-0.627 (+/-0.226) for {'C': 1, 'gamma': 0.1}
-0.550 (+/-0.192) for {'C': 1, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 10}
-0.823 (+/-0.277) for {'C': 10, 'gamma': 1}
-0.613 (+/-0.202) for {'C': 10, 'gamma': 0.1}
-0.583 (+/-0.185) for {'C': 10, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 10}
-0.823 (+/-0.277) for {'C': 100, 'gamma': 1}
-0.613 (+/-0.202) for {'C': 100, 'gamma': 0.1}
-0.666 (+/-0.142) for {'C': 100, 'gamma': 0.01}
CPU times: total: 51.7 s
Wall time: 10min 31s


In [96]:
%%time
# Tune `svr` over svm_grid_parameters on train_ecfp6_counts for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=6). No encoder is passed for this target.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'logLD50_mmolkg', encoder = None)

ld50_svr_ecfp6counts = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True, n_jobs = 6)



Best parameters set found on development set: {'C': 10, 'gamma': 0.01}
Best score: -0.5453515648632298
Grid scores on development set:

-0.827 (+/-0.309) for {'C': 1, 'gamma': 10}
-0.827 (+/-0.309) for {'C': 1, 'gamma': 1}
-0.675 (+/-0.262) for {'C': 1, 'gamma': 0.1}
-0.565 (+/-0.187) for {'C': 1, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 10}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 1}
-0.661 (+/-0.239) for {'C': 10, 'gamma': 0.1}
-0.545 (+/-0.164) for {'C': 10, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 10}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 1}
-0.661 (+/-0.239) for {'C': 100, 'gamma': 0.1}
-0.591 (+/-0.133) for {'C': 100, 'gamma': 0.01}
CPU times: total: 38.4 s
Wall time: 8min


In [97]:
%%time
# Tune `svr` over svm_grid_parameters on train_maccs for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5). n_jobs is not passed, so model_selection's default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'logLD50_mmolkg', encoder = None)

ld50_svr_maccs = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)


Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: -0.45290928375622946
Grid scores on development set:

-0.812 (+/-0.279) for {'C': 1, 'gamma': 10}
-0.766 (+/-0.262) for {'C': 1, 'gamma': 1}
-0.464 (+/-0.141) for {'C': 1, 'gamma': 0.1}
-0.522 (+/-0.140) for {'C': 1, 'gamma': 0.01}
-0.808 (+/-0.254) for {'C': 10, 'gamma': 10}
-0.754 (+/-0.243) for {'C': 10, 'gamma': 1}
-0.453 (+/-0.084) for {'C': 10, 'gamma': 0.1}
-0.478 (+/-0.163) for {'C': 10, 'gamma': 0.01}
-0.808 (+/-0.254) for {'C': 100, 'gamma': 10}
-0.754 (+/-0.243) for {'C': 100, 'gamma': 1}
-0.457 (+/-0.081) for {'C': 100, 'gamma': 0.1}
-0.554 (+/-0.176) for {'C': 100, 'gamma': 0.01}
CPU times: total: 5.77 s
Wall time: 1min 15s


In [98]:
%%time
# Tune `svr` over svm_grid_parameters on train_rdkit2d for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5). n_jobs is not passed, so model_selection's default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'logLD50_mmolkg', encoder = None)

ld50_svr_rdkit2d = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)


Best parameters set found on development set: {'C': 1, 'gamma': 1}
Best score: -0.49409442627621375
Grid scores on development set:

-0.593 (+/-0.224) for {'C': 1, 'gamma': 10}
-0.494 (+/-0.148) for {'C': 1, 'gamma': 1}
-0.590 (+/-0.171) for {'C': 1, 'gamma': 0.1}
-0.687 (+/-0.269) for {'C': 1, 'gamma': 0.01}
-0.580 (+/-0.211) for {'C': 10, 'gamma': 10}
-0.499 (+/-0.179) for {'C': 10, 'gamma': 1}
-0.533 (+/-0.139) for {'C': 10, 'gamma': 0.1}
-0.620 (+/-0.180) for {'C': 10, 'gamma': 0.01}
-0.580 (+/-0.212) for {'C': 100, 'gamma': 10}
-0.592 (+/-0.226) for {'C': 100, 'gamma': 1}
-0.530 (+/-0.167) for {'C': 100, 'gamma': 0.1}
-0.576 (+/-0.147) for {'C': 100, 'gamma': 0.01}
CPU times: total: 4.73 s
Wall time: 1min 8s


In [99]:
%%time
# Tune `svr` over svm_grid_parameters on train_mordred for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5). n_jobs is not passed, so model_selection's default applies.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'logLD50_mmolkg', encoder = None)

ld50_svr_mordred = model_selection(svr, svm_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)


Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: -0.46021540702351055
Grid scores on development set:

-0.788 (+/-0.294) for {'C': 1, 'gamma': 10}
-0.473 (+/-0.155) for {'C': 1, 'gamma': 1}
-0.496 (+/-0.098) for {'C': 1, 'gamma': 0.1}
-0.601 (+/-0.176) for {'C': 1, 'gamma': 0.01}
-0.777 (+/-0.267) for {'C': 10, 'gamma': 10}
-0.471 (+/-0.141) for {'C': 10, 'gamma': 1}
-0.460 (+/-0.102) for {'C': 10, 'gamma': 0.1}
-0.548 (+/-0.127) for {'C': 10, 'gamma': 0.01}
-0.777 (+/-0.267) for {'C': 100, 'gamma': 10}
-0.476 (+/-0.141) for {'C': 100, 'gamma': 1}
-0.537 (+/-0.161) for {'C': 100, 'gamma': 0.1}
-0.507 (+/-0.120) for {'C': 100, 'gamma': 0.01}
CPU times: total: 11 s
Wall time: 2min 23s


Random Forest Model
Endpoint 1: toxic

In [30]:
# Load the saved encoder for the 'toxic' target; it is passed to prepare_input(encoder=...) in the cells below.
# NOTE(review): joblib.load unpickles the file, so the .joblib file must come from a trusted source.
encoder_toxic = joblib.load('../encoder_models/encoder_toxic.joblib')

In [101]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_ecfp6_bits for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# GridSearch=False with n_iter=20 — presumably a randomized search sampling 20 settings; confirm in model_selection.
# NOTE(review): n_iter is 20 here while most other random-forest cells use 30.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'toxic', encoder = encoder_toxic)

t_rf_ecfp6bits = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=20,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: 0.7962032805553361
Grid scores on development set:

0.762 (+/-0.077) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': True}
0.795 (+/-0.067) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
0.787 (+/-0.072) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': True}
0.733 (+/-0.079) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.733 (+/-0.074) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.7

In [102]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_rdkit2d for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'toxic', encoder = encoder_toxic)

t_rf_rdkit2d = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
Best score: 0.8377562807671433
Grid scores on development set:

0.837 (+/-0.059) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.767 (+/-0.064) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.833 (+/-0.059) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': True}
0.830 (+/-0.057) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.831 (+/-0.058) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
0.

In [103]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_mordred for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'toxic', encoder = encoder_toxic)

t_rf_mordred = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.8350463294079058
Grid scores on development set:

0.783 (+/-0.063) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': True}
0.832 (+/-0.060) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
0.831 (+/-0.061) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
0.829 (+/-0.062) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
0.829 (+/-0.061) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
0.

In [104]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_ecfp6_counts for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# GridSearch=False with n_iter=20 — presumably a randomized search sampling 20 settings; confirm in model_selection.
# NOTE(review): n_iter is 20 here while most other random-forest cells use 30; the result name also drops
# the trailing 's' used by the SVC counterpart (t_svc_ecfp6counts).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'toxic', encoder = encoder_toxic)

t_rf_ecfp6count = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=20,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.7997123705018435
Grid scores on development set:

0.779 (+/-0.064) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': True}
0.710 (+/-0.063) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.761 (+/-0.066) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.760 (+/-0.067) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.792 (+/-0.062) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
0

In [31]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_maccs for target 'toxic' (scoring='roc_auc', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'toxic', encoder = encoder_toxic)

t_rf_maccs = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
Best score: 0.8220226523893197
Grid scores on development set:

0.822 (+/-0.067) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
0.822 (+/-0.068) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
0.822 (+/-0.065) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
0.822 (+/-0.065) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.760 (+/-0.073) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False

Endpoint 2: EPA

In [105]:
# Load the saved encoder for the 'EPA_category' target; it is passed to prepare_input(encoder=...) in the cells below.
# NOTE(review): joblib.load unpickles the file, so the .joblib file must come from a trusted source.
encoder_epa = joblib.load('../encoder_models/encoder_epa.joblib')

In [106]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_ecfp6_bits for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'EPA_category', encoder = encoder_epa)

epa_rf_ecfp6bits = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
Best score: 0.4962855434736273
Grid scores on development set:

0.457 (+/-0.066) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
0.480 (+/-0.057) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
0.334 (+/-0.000) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': True}
0.374 (+/-0.024) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
0.377 (+/-0.019) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': False}
0.

In [107]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_maccs for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'EPA_category', encoder = encoder_epa)

epa_rf_maccs = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
Best score: 0.5387384859360687
Grid scores on development set:

0.498 (+/-0.080) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
0.498 (+/-0.076) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': True}
0.393 (+/-0.032) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
0.500 (+/-0.074) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
0.479 (+/-0.074) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': None, 'bootstrap': True}
0.

In [108]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_ecfp6_counts for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): the result name drops the trailing 's' used by the SVC counterpart (epa_svc_ecfp6counts).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'EPA_category', encoder = encoder_epa)

epa_rf_ecfp6count = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.49182092154551615
Grid scores on development set:

0.492 (+/-0.055) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
0.357 (+/-0.016) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.442 (+/-0.057) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
0.339 (+/-0.009) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
0.467 (+/-0.053) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': Fa

In [109]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_rdkit2d for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'EPA_category', encoder = encoder_epa)

epa_rf_rdkit2d = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
Best score: 0.544268835812836
Grid scores on development set:

0.489 (+/-0.056) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.512 (+/-0.068) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
0.531 (+/-0.059) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
0.522 (+/-0.063) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': True}
0.534 (+/-0.054) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
0.534 (

In [110]:
%%time
# Tune `rf_clf` over rf_grid_parameters on train_mordred for target 'EPA_category'
# (scoring='f1_weighted', cv=5, n_jobs=1).
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'EPA_category', encoder = encoder_epa)

epa_rf_mordred = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)



Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
Best score: 0.5431836837060968
Grid scores on development set:

0.533 (+/-0.073) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
0.527 (+/-0.073) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
0.529 (+/-0.072) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': True}
0.527 (+/-0.077) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
0.543 (+/-0.064) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': Fals

Endpoint 3: logLD50

In [111]:
%%time
# Tune `rf_reg` over rf_grid_parameters on train_ecfp6_bits for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=1). No encoder is passed for this target.
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_ecfp6bits = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
Best score: -0.5374327338246813
Grid scores on development set:

-0.549 (+/-0.171) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': True}
-0.683 (+/-0.225) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
-0.543 (+/-0.166) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
-0.774 (+/-0.266) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
-0.719 (+/-0.229) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False

In [112]:
%%time
# Tune `rf_reg` over rf_grid_parameters on train_maccs for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=1). No encoder is passed for this target.
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_maccs = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: -0.443660837367564
Grid scores on development set:

-0.481 (+/-0.121) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
-0.467 (+/-0.116) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
-0.456 (+/-0.110) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
-0.506 (+/-0.133) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
-0.489 (+/-0.127) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap'

In [113]:
%%time
# Tune `rf_reg` over rf_grid_parameters on train_ecfp6_counts for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=1). No encoder is passed for this target.
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): the result name drops the trailing 's' used by the SVR counterpart (ld50_svr_ecfp6counts).
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_ecfp6count = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: -0.5408206640969156
Grid scores on development set:

-0.632 (+/-0.219) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
-0.776 (+/-0.271) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': True}
-0.586 (+/-0.186) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
-0.727 (+/-0.240) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
-0.774 (+/-0.269) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': Fal

In [114]:
%%time
# Tune `rf_reg` over rf_grid_parameters on train_rdkit2d for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=1). No encoder is passed for this target.
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_rdkit2d = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
Best score: -0.46541088612250886
Grid scores on development set:

-0.470 (+/-0.137) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
-0.502 (+/-0.154) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
-0.471 (+/-0.138) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
-0.492 (+/-0.149) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
-0.484 (+/-0.144) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootst

In [115]:
%%time
# Tune `rf_reg` over rf_grid_parameters on train_mordred for target 'logLD50_mmolkg'
# (scoring='neg_mean_squared_error', cv=5, n_jobs=1). No encoder is passed for this target.
# GridSearch=False with n_iter=30 — presumably a randomized search sampling 30 settings; confirm in model_selection.
# NOTE(review): prepare_input/model_selection are defined outside this excerpt; only outputs `a` and `c`
# are used here — presumably features and target, confirm against the helper.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_mordred = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)



Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: -0.45110994944203975
Grid scores on development set:

-0.492 (+/-0.136) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': True}
-0.486 (+/-0.143) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
-0.469 (+/-0.127) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
-0.451 (+/-0.124) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
-0.479 (+/-0.137) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': Tr

XGBoost
Endpoint 1: logLD50

In [116]:
%%time
# Prepare inputs for the 'logLD50_mmolkg' target from the train_ecfp6_bits
# feature table; no label encoder is supplied for this target.
# NOTE(review): presumably `a`/`c` are the training features and target as
# returned by prepare_input (defined elsewhere) -- confirm. `b`, `d`, `e`
# are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='logLD50_mmolkg',
    encoder=None,
)

# Search xgb_grid_parameters for xgb_reg with cv=5, scored by negated MSE
# (closer to 0 is better). GridSearch=False with n_iter=30 presumably means
# a randomized search of 30 candidates -- confirm in model_selection.
ld50_xgb_ecfp6bits = model_selection(
    xgb_reg,
    xgb_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.9}
Best score: -0.49743116439673163
Grid scores on development set:

-0.520 (+/-0.178) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.9}
-0.581 (+/-0.195) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
-0.530 (+/-0.169) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.7}
-0.522 (+/-0.174) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.6}
-0.599 (+/-0.205) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'le

In [117]:
%%time
# Prepare inputs for the 'logLD50_mmolkg' target from the train_maccs
# feature table; no label encoder is supplied for this target.
# NOTE(review): presumably `a`/`c` are the training features and target as
# returned by prepare_input (defined elsewhere) -- confirm. `b`, `d`, `e`
# are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='logLD50_mmolkg',
    encoder=None,
)

# Search xgb_grid_parameters for xgb_reg with cv=5, scored by negated MSE
# (closer to 0 is better). GridSearch=False with n_iter=30 presumably means
# a randomized search of 30 candidates -- confirm in model_selection.
ld50_xgb_maccs = model_selection(
    xgb_reg,
    xgb_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: -0.4310792218926224
Grid scores on development set:

-0.562 (+/-0.159) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.9}
-0.562 (+/-0.170) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
-0.524 (+/-0.149) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
-0.479 (+/-0.078) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
-0.553 (+/-0.160) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'lear

In [118]:
%%time
# Prepare inputs for the 'logLD50_mmolkg' target from the train_ecfp6_counts
# feature table; no label encoder is supplied for this target.
# NOTE(review): presumably `a`/`c` are the training features and target as
# returned by prepare_input (defined elsewhere) -- confirm. `b`, `d`, `e`
# are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='logLD50_mmolkg',
    encoder=None,
)

# Search xgb_grid_parameters for xgb_reg with cv=5, scored by negated MSE
# (closer to 0 is better). GridSearch=False with n_iter=30 presumably means
# a randomized search of 30 candidates -- confirm in model_selection.
ld50_xgb_ecfp6count = model_selection(
    xgb_reg,
    xgb_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: -0.4954868664317701
Grid scores on development set:

-0.589 (+/-0.194) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.7}
-0.585 (+/-0.185) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
-0.535 (+/-0.160) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.8}
-0.526 (+/-0.166) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.7}
-0.534 (+/-0.177) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 

In [119]:
%%time
# Prepare inputs for the 'logLD50_mmolkg' target from the train_rdkit2d
# feature table; no label encoder is supplied for this target.
# NOTE(review): presumably `a`/`c` are the training features and target as
# returned by prepare_input (defined elsewhere) -- confirm. `b`, `d`, `e`
# are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='logLD50_mmolkg',
    encoder=None,
)

# Search xgb_grid_parameters for xgb_reg with cv=5, scored by negated MSE
# (closer to 0 is better). GridSearch=False with n_iter=30 presumably means
# a randomized search of 30 candidates -- confirm in model_selection.
ld50_xgb_rdkit2d = model_selection(
    xgb_reg,
    xgb_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: -0.4468637779795599
Grid scores on development set:

-0.481 (+/-0.118) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
-0.477 (+/-0.122) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0}
-0.520 (+/-0.148) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
-0.467 (+/-0.129) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.5}
-0.477 (+/-0.123) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 

In [120]:
%%time
# Prepare inputs for the 'logLD50_mmolkg' target from the train_mordred
# feature table; no label encoder is supplied for this target.
# NOTE(review): presumably `a`/`c` are the training features and target as
# returned by prepare_input (defined elsewhere) -- confirm. `b`, `d`, `e`
# are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='logLD50_mmolkg',
    encoder=None,
)

# Search xgb_grid_parameters for xgb_reg with cv=5, scored by negated MSE
# (closer to 0 is better). GridSearch=False with n_iter=30 presumably means
# a randomized search of 30 candidates -- confirm in model_selection.
ld50_xgb_mordred = model_selection(
    xgb_reg,
    xgb_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: -0.42682088858473594
Grid scores on development set:

-0.428 (+/-0.103) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
-0.481 (+/-0.137) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}
-0.495 (+/-0.137) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
-0.427 (+/-0.101) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
-0.474 (+/-0.100) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 

Endpoint 2: toxic

In [None]:
# Load the persisted encoder for the 'toxic' target; it is passed as
# `encoder=` to prepare_input in the 'toxic' cells that follow.
# NOTE(review): joblib.load unpickles the file, so only load artifacts
# produced by this project / a trusted source.
encoder_toxic = joblib.load('../encoder_models/encoder_toxic.joblib')

In [19]:
%%time
# Prepare inputs for the 'toxic' target from the train_ecfp6_bits feature
# table, passing encoder_toxic for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='toxic',
    encoder=encoder_toxic,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'roc_auc'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
t_xgb_ecfp6bits = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
Best score: 0.779149971311067
Grid scores on development set:

0.767 (+/-0.063) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
0.755 (+/-0.048) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.7}
0.765 (+/-0.056) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.754 (+/-0.050) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.8}
0.764 (+/-0.053) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_ra

In [20]:
%%time
# Prepare inputs for the 'toxic' target from the train_maccs feature table,
# passing encoder_toxic for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='toxic',
    encoder=encoder_toxic,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'roc_auc'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
t_xgb_maccs = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: 0.8204572276771425
Grid scores on development set:

0.803 (+/-0.073) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}
0.804 (+/-0.074) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
0.796 (+/-0.068) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
0.779 (+/-0.068) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.799 (+/-0.074) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learni

In [21]:
%%time
# Prepare inputs for the 'toxic' target from the train_rdkit2d feature
# table, passing encoder_toxic for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='toxic',
    encoder=encoder_toxic,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'roc_auc'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
t_xgb_rdkit2d = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: 0.8300594548863851
Grid scores on development set:

0.807 (+/-0.058) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.8}
0.812 (+/-0.063) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
0.828 (+/-0.063) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.8}
0.809 (+/-0.058) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.811 (+/-0.063) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learnin

In [22]:
%%time
# Prepare inputs for the 'toxic' target from the train_mordred feature
# table, passing encoder_toxic for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='toxic',
    encoder=encoder_toxic,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'roc_auc'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
t_xgb_mordred = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.7}
Best score: 0.8345831522387966
Grid scores on development set:

0.832 (+/-0.061) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.816 (+/-0.068) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.7}
0.813 (+/-0.067) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0}
0.801 (+/-0.067) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
0.826 (+/-0.064) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learn

In [23]:
%%time
# Prepare inputs for the 'toxic' target from the train_ecfp6_counts feature
# table, passing encoder_toxic for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='toxic',
    encoder=encoder_toxic,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'roc_auc'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
t_xgb_ecfp6count = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.9}
Best score: 0.7828889119910382
Grid scores on development set:

0.758 (+/-0.068) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.8}
0.747 (+/-0.059) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
0.762 (+/-0.060) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
0.761 (+/-0.054) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
0.734 (+/-0.066) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learnin

Endpoint 3: EPA

In [24]:
# Load the persisted encoder for the 'EPA_category' target; it is passed as
# `encoder=` to prepare_input in the EPA cells that follow.
# NOTE(review): joblib.load unpickles the file, so only load artifacts
# produced by this project / a trusted source.
encoder_epa = joblib.load('../encoder_models/encoder_epa.joblib')


In [25]:
%%time
# Prepare inputs for the 'EPA_category' target from the train_ecfp6_bits
# feature table, passing encoder_epa for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='EPA_category',
    encoder=encoder_epa,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'f1_weighted'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
epa_xgb_ecfp6bits = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
Best score: 0.5168191594611219
Grid scores on development set:

0.475 (+/-0.049) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.8}
0.502 (+/-0.052) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0}
0.497 (+/-0.048) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.6}
0.503 (+/-0.048) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
0.499 (+/-0.046) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_r

In [26]:
%%time
# Prepare inputs for the 'EPA_category' target from the train_maccs feature
# table, passing encoder_epa for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='EPA_category',
    encoder=encoder_epa,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'f1_weighted'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
epa_xgb_maccs = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: 0.5367818120130525
Grid scores on development set:

0.534 (+/-0.024) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.461 (+/-0.047) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.527 (+/-0.025) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
0.531 (+/-0.053) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
0.447 (+/-0.062) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learni

In [27]:
%%time
# Prepare inputs for the 'EPA_category' target from the train_ecfp6_counts
# feature table, passing encoder_epa for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='EPA_category',
    encoder=encoder_epa,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'f1_weighted'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
epa_xgb_ecfp6count = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: 0.5164297901337191
Grid scores on development set:

0.408 (+/-0.041) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.464 (+/-0.038) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
0.506 (+/-0.043) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.9}
0.414 (+/-0.044) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
0.510 (+/-0.044) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_

In [28]:
%%time
# Prepare inputs for the 'EPA_category' target from the train_rdkit2d
# feature table, passing encoder_epa for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='EPA_category',
    encoder=encoder_epa,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'f1_weighted'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
epa_xgb_rdkit2d = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: 0.5437794552662495
Grid scores on development set:

0.543 (+/-0.042) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
0.457 (+/-0.023) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.7}
0.530 (+/-0.063) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.6}
0.536 (+/-0.043) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}
0.502 (+/-0.050) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning

In [29]:
%%time
# Prepare inputs for the 'EPA_category' target from the train_mordred
# feature table, passing encoder_epa for the labels.
# NOTE(review): presumably `a`/`c` are the training features and encoded
# target as returned by prepare_input (defined elsewhere) -- confirm.
# `b`, `d`, `e` are not used in this cell.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='EPA_category',
    encoder=encoder_epa,
)

# Search xgb_grid_parameters for xgb_clf with cv=5, scored by 'f1_weighted'
# (higher is better). GridSearch=False with n_iter=30 presumably means a
# randomized search of 30 candidates -- confirm in model_selection.
epa_xgb_mordred = model_selection(
    xgb_clf,
    xgb_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)


Best parameters set found on development set: {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: 0.5482312937748454
Grid scores on development set:

0.542 (+/-0.071) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.9}
0.536 (+/-0.068) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.8}
0.520 (+/-0.077) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.548 (+/-0.064) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
0.545 (+/-0.068) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learni