# Setup

## Mount Drive


In [73]:
from google.colab import drive
# drive.mount('/content/drive')

In [74]:
%cd drive/'My Drive'/repositories/moleculenet/notebooks

[Errno 2] No such file or directory: 'drive/My Drive/repositories/moleculenet/notebooks'
/content/drive/My Drive/repositories/moleculenet/notebooks


In [75]:
# List the contents of the current working directory (the notebooks folder).
%ls

colab_modelling_pipeline.ipynb
colab_RF_CIs_on_fingerprints_initial.ipynb
get_ecfp4_ecfp6_final.ipynb
get_original_id_smile_target_esol.ipynb
get_original_id_smile_target_freeesolv.ipynb
get_original_id_smile_target_lipophilicity.ipynb
get_protenated_from_canonical.ipynb
get_rdkit_descriptors_final.ipynb


In [76]:
# List the files available in ../data/, the directory the CSV inputs are read from below.
%ls ../data/

esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original.csv
esol_original_extra_features.csv
esol_original_IdSmilesLabels.csv
esol_original_rdkit_features.csv
ESOL_README
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
FreeSolv_README
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv
Lipo_README


## Import modules

In [77]:
import warnings
warnings.filterwarnings('ignore')

# standard modules
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# evaluating metrics
from scipy.stats import pearsonr
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import jaccard_score # Tanimoto

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

# preprocessing
from sklearn.feature_selection import VarianceThreshold # to remove zero-var features
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.kernel_ridge import KernelRidge

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, Matern


# pipelines
# https://scikit-learn.org/stable/modules/compose.html#combining-estimators
from sklearn.pipeline import make_pipeline, Pipeline

In [78]:
import sklearn
# Record the scikit-learn version this run was executed with.
print(sklearn.__version__)

0.23.1


In [79]:
try:
    import forestci as fci
except ImportError as e:
    ! pip install forestci
    import forestci as fci

In [80]:
# Report the versions of the two libraries whose behaviour the results depend on.
for label, module in (('sklearn.__version__ :', sklearn),
                      ('fci.__version__ :', fci)):
    print(label, module.__version__)

sklearn.__version__ : 0.23.1
fci.__version__ : 0.4.1


## Set plotting style

In [81]:
%matplotlib inline
plt.style.use('fivethirtyeight')

plt.rcParams['axes.facecolor']='w'
#plt.rcParams['axes.linewidth']=1
plt.rcParams['axes.edgecolor']='w'
plt.rcParams['figure.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
#plt.rcParams['grid.color']='white'

# Load and Prepare Data

In [82]:
# Which dataset and which SMILES variant the rest of the notebook works on;
# both values are interpolated into the CSV file names read below.
dataset, smile_type = 'freesolv', 'original'

# READONLY: fail early on an unsupported choice
assert dataset in ('esol', 'freesolv', 'lipophilicity')
assert smile_type in ('original', 'protonated')

## Load Features

In [83]:
# Every table is read with its first column as the row index.
csv_options = dict(index_col=0)

# original data (ids, SMILES and labels)
id_smile_target = pd.read_csv(f'../data/{dataset}_{smile_type}_IdSmilesLabels.csv', **csv_options)

# RDKit descriptor features
rdkit_features = pd.read_csv(f'../data/{dataset}_{smile_type}_rdkit_features.csv', **csv_options)

# ECFP fingerprint features, 1024 columns
ecfp4_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp4_features.csv', **csv_options)
ecfp6_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp6_features.csv', **csv_options)

# ECFP fingerprint features, 2048 columns
ecfp4_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp4_features.csv', **csv_options)
ecfp6_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp6_features.csv', **csv_options)

# regression target: the 'labels' column of the original data
labels = id_smile_target['labels']

In [84]:
# Print the shape of every loaded table; captions are padded so the shapes line up.
shape_report = (
    ('rdkit_features.shape:      ', rdkit_features),
    ('ecfp4_1024_features.shape: ', ecfp4_1024_features),
    ('ecfp6_1024_features.shape: ', ecfp6_1024_features),
    ('ecfp4_2048_features.shape: ', ecfp4_2048_features),
    ('ecfp6_2048_features.shape: ', ecfp6_2048_features),
    ('labels.shape:              ', labels),
)
for caption, table in shape_report:
    print(caption, table.shape)

rdkit_features.shape:       (642, 200)
ecfp4_1024_features.shape:  (642, 1024)
ecfp6_1024_features.shape:  (642, 1024)
ecfp4_2048_features.shape:  (642, 2048)
ecfp6_2048_features.shape:  (642, 2048)
labels.shape:               (642,)


In [85]:
# Preview the first rows of the RDKit feature table.
rdkit_features.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,...,fr_hdrzine,fr_hdrzone,fr_imidazole,fr_imide,fr_isocyan,fr_isothiocyan,fr_ketone,fr_ketone_Topliss,fr_lactam,fr_lactone,fr_methoxy,fr_morpholine,fr_nitrile,fr_nitro,fr_nitro_arom,fr_nitro_arom_nonortho,fr_nitroso,fr_oxazole,fr_oxime,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_phos_acid,fr_phos_ester,fr_piperdine,fr_piperzine,fr_priamide,fr_prisulfonamd,fr_pyridine,fr_quatN,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
"4-methoxy-N,N-dimethyl-benzamide",12.42817,-3.458874,12.42817,0.519264,0.68636,179.219,166.115,179.094629,70.0,0.0,0.252836,-0.496768,0.496768,0.252836,1.384615,2.076923,2.615385,3.67558,674.590985,20.825909,19.26371,6.26371,11.39257,9.533193,3.033193,1.946749,1.946749,1.12108,1.12108,0.570798,0.570798,-1.51,110210.129799,2.108111,3.93736,2.211653,96.190689,9.636773,5.749512,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
methanesulfonyl chloride,9.85571,-4.368056,9.85571,3.074846,0.421427,114.553,111.529,113.954228,32.0,0.0,0.229212,-0.212518,0.229212,0.212518,2.0,2.2,2.2,4.534785,166.212669,7.0,5.102709,3.675135,3.25,2.191761,3.001103,0.437848,2.532383,0.0,0.0,0.0,0.0,0.24,21.306059,1.797156,1.143107,369.351111,39.38663,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-methylbut-1-ene,7.349537,-3.289005,7.349537,1.280324,0.412737,70.135,60.055,70.07825,30.0,0.0,0.057236,-0.102824,0.102824,0.057236,1.8,2.6,2.8,5.772015,248.517785,12.654701,12.5,2.5,6.354059,6.0,1.0,0.5,0.5,0.125,0.125,0.0,0.0,-0.26,644.471039,0.351195,2.00738,3.74,47.559121,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2-ethylpyrazine,7.363796,-3.006484,7.363796,0.587878,0.536795,108.144,100.08,108.068748,42.0,0.0,0.08596,-0.261224,0.261224,0.08596,1.375,2.25,3.125,3.394825,411.724942,12.723615,11.894427,3.894427,7.070386,5.894427,1.894427,1.032624,1.032624,0.553812,0.553812,0.267705,0.267705,-0.92,1798.391122,1.1509,2.403302,1.025681,59.840347,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
heptan-1-ol,7.58125,-4.173333,7.58125,3.423878,0.544191,116.204,100.076,116.120115,50.0,0.0,0.210037,-0.396377,0.396377,0.210037,1.125,1.875,2.625,6.416917,459.536609,20.207107,19.908248,3.908248,10.06066,9.612372,1.704124,0.727062,0.727062,0.301031,0.301031,0.119266,0.119266,-0.04,27745.345015,0.731455,6.96,5.96,74.289336,5.108808,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Preview the first rows of the 1024-column ECFP4 feature table.
ecfp4_1024_features.head()

Unnamed: 0,1024ecfp4-0,1024ecfp4-1,1024ecfp4-2,1024ecfp4-3,1024ecfp4-4,1024ecfp4-5,1024ecfp4-6,1024ecfp4-7,1024ecfp4-8,1024ecfp4-9,1024ecfp4-10,1024ecfp4-11,1024ecfp4-12,1024ecfp4-13,1024ecfp4-14,1024ecfp4-15,1024ecfp4-16,1024ecfp4-17,1024ecfp4-18,1024ecfp4-19,1024ecfp4-20,1024ecfp4-21,1024ecfp4-22,1024ecfp4-23,1024ecfp4-24,1024ecfp4-25,1024ecfp4-26,1024ecfp4-27,1024ecfp4-28,1024ecfp4-29,1024ecfp4-30,1024ecfp4-31,1024ecfp4-32,1024ecfp4-33,1024ecfp4-34,1024ecfp4-35,1024ecfp4-36,1024ecfp4-37,1024ecfp4-38,1024ecfp4-39,...,1024ecfp4-984,1024ecfp4-985,1024ecfp4-986,1024ecfp4-987,1024ecfp4-988,1024ecfp4-989,1024ecfp4-990,1024ecfp4-991,1024ecfp4-992,1024ecfp4-993,1024ecfp4-994,1024ecfp4-995,1024ecfp4-996,1024ecfp4-997,1024ecfp4-998,1024ecfp4-999,1024ecfp4-1000,1024ecfp4-1001,1024ecfp4-1002,1024ecfp4-1003,1024ecfp4-1004,1024ecfp4-1005,1024ecfp4-1006,1024ecfp4-1007,1024ecfp4-1008,1024ecfp4-1009,1024ecfp4-1010,1024ecfp4-1011,1024ecfp4-1012,1024ecfp4-1013,1024ecfp4-1014,1024ecfp4-1015,1024ecfp4-1016,1024ecfp4-1017,1024ecfp4-1018,1024ecfp4-1019,1024ecfp4-1020,1024ecfp4-1021,1024ecfp4-1022,1024ecfp4-1023
"4-methoxy-N,N-dimethyl-benzamide",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [87]:
# Preview the first rows of the 1024-column ECFP6 feature table.
ecfp6_1024_features.head()

Unnamed: 0,1024ecfp6-0,1024ecfp6-1,1024ecfp6-2,1024ecfp6-3,1024ecfp6-4,1024ecfp6-5,1024ecfp6-6,1024ecfp6-7,1024ecfp6-8,1024ecfp6-9,1024ecfp6-10,1024ecfp6-11,1024ecfp6-12,1024ecfp6-13,1024ecfp6-14,1024ecfp6-15,1024ecfp6-16,1024ecfp6-17,1024ecfp6-18,1024ecfp6-19,1024ecfp6-20,1024ecfp6-21,1024ecfp6-22,1024ecfp6-23,1024ecfp6-24,1024ecfp6-25,1024ecfp6-26,1024ecfp6-27,1024ecfp6-28,1024ecfp6-29,1024ecfp6-30,1024ecfp6-31,1024ecfp6-32,1024ecfp6-33,1024ecfp6-34,1024ecfp6-35,1024ecfp6-36,1024ecfp6-37,1024ecfp6-38,1024ecfp6-39,...,1024ecfp6-984,1024ecfp6-985,1024ecfp6-986,1024ecfp6-987,1024ecfp6-988,1024ecfp6-989,1024ecfp6-990,1024ecfp6-991,1024ecfp6-992,1024ecfp6-993,1024ecfp6-994,1024ecfp6-995,1024ecfp6-996,1024ecfp6-997,1024ecfp6-998,1024ecfp6-999,1024ecfp6-1000,1024ecfp6-1001,1024ecfp6-1002,1024ecfp6-1003,1024ecfp6-1004,1024ecfp6-1005,1024ecfp6-1006,1024ecfp6-1007,1024ecfp6-1008,1024ecfp6-1009,1024ecfp6-1010,1024ecfp6-1011,1024ecfp6-1012,1024ecfp6-1013,1024ecfp6-1014,1024ecfp6-1015,1024ecfp6-1016,1024ecfp6-1017,1024ecfp6-1018,1024ecfp6-1019,1024ecfp6-1020,1024ecfp6-1021,1024ecfp6-1022,1024ecfp6-1023
"4-methoxy-N,N-dimethyl-benzamide",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [88]:
# Preview the first rows of the 2048-column ECFP4 feature table.
ecfp4_2048_features.head()

Unnamed: 0,2048ecfp4-0,2048ecfp4-1,2048ecfp4-2,2048ecfp4-3,2048ecfp4-4,2048ecfp4-5,2048ecfp4-6,2048ecfp4-7,2048ecfp4-8,2048ecfp4-9,2048ecfp4-10,2048ecfp4-11,2048ecfp4-12,2048ecfp4-13,2048ecfp4-14,2048ecfp4-15,2048ecfp4-16,2048ecfp4-17,2048ecfp4-18,2048ecfp4-19,2048ecfp4-20,2048ecfp4-21,2048ecfp4-22,2048ecfp4-23,2048ecfp4-24,2048ecfp4-25,2048ecfp4-26,2048ecfp4-27,2048ecfp4-28,2048ecfp4-29,2048ecfp4-30,2048ecfp4-31,2048ecfp4-32,2048ecfp4-33,2048ecfp4-34,2048ecfp4-35,2048ecfp4-36,2048ecfp4-37,2048ecfp4-38,2048ecfp4-39,...,2048ecfp4-2008,2048ecfp4-2009,2048ecfp4-2010,2048ecfp4-2011,2048ecfp4-2012,2048ecfp4-2013,2048ecfp4-2014,2048ecfp4-2015,2048ecfp4-2016,2048ecfp4-2017,2048ecfp4-2018,2048ecfp4-2019,2048ecfp4-2020,2048ecfp4-2021,2048ecfp4-2022,2048ecfp4-2023,2048ecfp4-2024,2048ecfp4-2025,2048ecfp4-2026,2048ecfp4-2027,2048ecfp4-2028,2048ecfp4-2029,2048ecfp4-2030,2048ecfp4-2031,2048ecfp4-2032,2048ecfp4-2033,2048ecfp4-2034,2048ecfp4-2035,2048ecfp4-2036,2048ecfp4-2037,2048ecfp4-2038,2048ecfp4-2039,2048ecfp4-2040,2048ecfp4-2041,2048ecfp4-2042,2048ecfp4-2043,2048ecfp4-2044,2048ecfp4-2045,2048ecfp4-2046,2048ecfp4-2047
"4-methoxy-N,N-dimethyl-benzamide",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [89]:
# Preview the first rows of the 2048-column ECFP6 feature table.
ecfp6_2048_features.head()

Unnamed: 0,2048ecfp6-0,2048ecfp6-1,2048ecfp6-2,2048ecfp6-3,2048ecfp6-4,2048ecfp6-5,2048ecfp6-6,2048ecfp6-7,2048ecfp6-8,2048ecfp6-9,2048ecfp6-10,2048ecfp6-11,2048ecfp6-12,2048ecfp6-13,2048ecfp6-14,2048ecfp6-15,2048ecfp6-16,2048ecfp6-17,2048ecfp6-18,2048ecfp6-19,2048ecfp6-20,2048ecfp6-21,2048ecfp6-22,2048ecfp6-23,2048ecfp6-24,2048ecfp6-25,2048ecfp6-26,2048ecfp6-27,2048ecfp6-28,2048ecfp6-29,2048ecfp6-30,2048ecfp6-31,2048ecfp6-32,2048ecfp6-33,2048ecfp6-34,2048ecfp6-35,2048ecfp6-36,2048ecfp6-37,2048ecfp6-38,2048ecfp6-39,...,2048ecfp6-2008,2048ecfp6-2009,2048ecfp6-2010,2048ecfp6-2011,2048ecfp6-2012,2048ecfp6-2013,2048ecfp6-2014,2048ecfp6-2015,2048ecfp6-2016,2048ecfp6-2017,2048ecfp6-2018,2048ecfp6-2019,2048ecfp6-2020,2048ecfp6-2021,2048ecfp6-2022,2048ecfp6-2023,2048ecfp6-2024,2048ecfp6-2025,2048ecfp6-2026,2048ecfp6-2027,2048ecfp6-2028,2048ecfp6-2029,2048ecfp6-2030,2048ecfp6-2031,2048ecfp6-2032,2048ecfp6-2033,2048ecfp6-2034,2048ecfp6-2035,2048ecfp6-2036,2048ecfp6-2037,2048ecfp6-2038,2048ecfp6-2039,2048ecfp6-2040,2048ecfp6-2041,2048ecfp6-2042,2048ecfp6-2043,2048ecfp6-2044,2048ecfp6-2045,2048ecfp6-2046,2048ecfp6-2047
"4-methoxy-N,N-dimethyl-benzamide",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [90]:
# Preview the first target values (the 'labels' column, indexed by molecule id).
labels.head()

id
4-methoxy-N,N-dimethyl-benzamide   -11.01
methanesulfonyl chloride            -4.87
3-methylbut-1-ene                    1.83
2-ethylpyrazine                     -5.45
heptan-1-ol                         -4.21
Name: labels, dtype: float64

In [91]:
# Establish Feature Sets

In [92]:
# Put the RDKit table and the four fingerprint tables side by side (column-wise),
# aligned on their shared row index.
feature_tables = [
    rdkit_features,
    ecfp4_1024_features,
    ecfp6_1024_features,
    ecfp4_2048_features,
    ecfp6_2048_features,
]
all_features = pd.concat(feature_tables, axis='columns')

In [93]:
# (rows, columns) of the combined feature table.
all_features.shape

(642, 6344)

In [94]:
# Preview the first rows of the combined feature table.
all_features.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,...,2048ecfp6-2008,2048ecfp6-2009,2048ecfp6-2010,2048ecfp6-2011,2048ecfp6-2012,2048ecfp6-2013,2048ecfp6-2014,2048ecfp6-2015,2048ecfp6-2016,2048ecfp6-2017,2048ecfp6-2018,2048ecfp6-2019,2048ecfp6-2020,2048ecfp6-2021,2048ecfp6-2022,2048ecfp6-2023,2048ecfp6-2024,2048ecfp6-2025,2048ecfp6-2026,2048ecfp6-2027,2048ecfp6-2028,2048ecfp6-2029,2048ecfp6-2030,2048ecfp6-2031,2048ecfp6-2032,2048ecfp6-2033,2048ecfp6-2034,2048ecfp6-2035,2048ecfp6-2036,2048ecfp6-2037,2048ecfp6-2038,2048ecfp6-2039,2048ecfp6-2040,2048ecfp6-2041,2048ecfp6-2042,2048ecfp6-2043,2048ecfp6-2044,2048ecfp6-2045,2048ecfp6-2046,2048ecfp6-2047
"4-methoxy-N,N-dimethyl-benzamide",12.42817,-3.458874,12.42817,0.519264,0.68636,179.219,166.115,179.094629,70.0,0.0,0.252836,-0.496768,0.496768,0.252836,1.384615,2.076923,2.615385,3.67558,674.590985,20.825909,19.26371,6.26371,11.39257,9.533193,3.033193,1.946749,1.946749,1.12108,1.12108,0.570798,0.570798,-1.51,110210.129799,2.108111,3.93736,2.211653,96.190689,9.636773,5.749512,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,9.85571,-4.368056,9.85571,3.074846,0.421427,114.553,111.529,113.954228,32.0,0.0,0.229212,-0.212518,0.229212,0.212518,2.0,2.2,2.2,4.534785,166.212669,7.0,5.102709,3.675135,3.25,2.191761,3.001103,0.437848,2.532383,0.0,0.0,0.0,0.0,0.24,21.306059,1.797156,1.143107,369.351111,39.38663,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,7.349537,-3.289005,7.349537,1.280324,0.412737,70.135,60.055,70.07825,30.0,0.0,0.057236,-0.102824,0.102824,0.057236,1.8,2.6,2.8,5.772015,248.517785,12.654701,12.5,2.5,6.354059,6.0,1.0,0.5,0.5,0.125,0.125,0.0,0.0,-0.26,644.471039,0.351195,2.00738,3.74,47.559121,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,7.363796,-3.006484,7.363796,0.587878,0.536795,108.144,100.08,108.068748,42.0,0.0,0.08596,-0.261224,0.261224,0.08596,1.375,2.25,3.125,3.394825,411.724942,12.723615,11.894427,3.894427,7.070386,5.894427,1.894427,1.032624,1.032624,0.553812,0.553812,0.267705,0.267705,-0.92,1798.391122,1.1509,2.403302,1.025681,59.840347,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,7.58125,-4.173333,7.58125,3.423878,0.544191,116.204,100.076,116.120115,50.0,0.0,0.210037,-0.396377,0.396377,0.210037,1.125,1.875,2.625,6.416917,459.536609,20.207107,19.908248,3.908248,10.06066,9.612372,1.704124,0.727062,0.727062,0.301031,0.301031,0.119266,0.119266,-0.04,27745.345015,0.731455,6.96,5.96,74.289336,5.108808,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Create feature_sets dictionary with pd.Index objects (containing feature names) as its elements

### Add original features into feature_sets

In [95]:
# Map each feature-set name to the pd.Index of column names belonging to it.
feature_sets = {
    set_name: table.columns
    for set_name, table in (
        ('RDKit', rdkit_features),
        ('1024ecfp-4', ecfp4_1024_features),
        ('1024ecfp-6', ecfp6_1024_features),
        ('2048ecfp-4', ecfp4_2048_features),
        ('2048ecfp-6', ecfp6_2048_features),
    )
}

### Adding a combination of RDKit features with every feature set in the feature_sets dictionary

In [96]:
# For every fingerprint set, register a second set that also contains the RDKit columns.
rdkit_columns = feature_sets['RDKit']
for f in ('1024ecfp-4', '1024ecfp-6', '2048ecfp-4', '2048ecfp-6'):
    combined_name = f'{f} + RDKit'
    feature_sets[combined_name] = feature_sets[f].union(rdkit_columns)

In [97]:
# Names of all registered feature sets, in insertion order.
list(feature_sets)

['RDKit',
 '1024ecfp-4',
 '1024ecfp-6',
 '2048ecfp-4',
 '2048ecfp-6',
 '1024ecfp-4 + RDKit',
 '1024ecfp-6 + RDKit',
 '2048ecfp-4 + RDKit',
 '2048ecfp-6 + RDKit']

In [98]:
# Number of columns in each feature set, in the same order as the names above.
[len(columns) for columns in feature_sets.values()]

[200, 1024, 1024, 2048, 2048, 1224, 1224, 2248, 2248]

# Train-Test split: 90/10

**Choose to use a small subset for training**

In [99]:
# Debug switch: if True, only the first 100 observations take part in the 90-10
# train-test split (for computational efficiency); if False the whole dataset is used.
use_small = False

**Subset the features and shuffle the data beforehand**

In [100]:
# if use_small:
#     n = 100
# else:
#     n = all_features.shape[0]

In [101]:
# np.random.seed(42)
# test = pd.Index(np.random.choice(ready_indexes, size=n//10, replace=False))
# print(test[:5], len(test))

# train = ready_indexes.difference(test)
# print(train[:5], len(train))

In [102]:
# np.random.seed(42)
# ready_indexes = pd.Index(np.random.choice(all_features.index, size=n, replace=False))
# print(ready_indexes[:5], len(ready_indexes))

In [103]:
# Number of leading rows of `all_features` that take part in the experiment.
working_size = 100 if use_small else all_features.shape[0]

# One seeded 90/10 shuffle split; it yields positional row indices.
holdout_splitter = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
train_test_split_gen = holdout_splitter.split(all_features.iloc[:working_size])
train_positions, test_positions = next(train_test_split_gen)

# Turn the positions into index labels so that .loc can be used from here on.
train = all_features.index[train_positions]
test = all_features.index[test_positions]

print(test[:5], len(test))
print(train[:5], len(train))

Index(['1-bromo-2-methyl-propane', '1,2,4-trichlorodibenzo-p-dioxin',
       '1-amino-9,10-anthracenedione', 'triethylphosphate',
       '2-(nitrooxy)ethan-1-ol'],
      dtype='object') 65
Index(['cyclohexanamine', 'diphenyl ether', 'ethanol', '3-methylheptane',
       '111-trifluoropropan-2-ol'],
      dtype='object') 577


In [104]:
# Shapes of the feature table and of the labels restricted to the first `working_size` rows.
all_features.iloc[:working_size].shape, labels.iloc[:working_size].shape

((642, 6344), (642,))

# Training

**Choose the feature set to use for now; we will iterate over all feature sets later**

In [105]:
# main feature set to use for now; must be one of the keys of `feature_sets`
f = 'RDKit'

In [106]:
# Training rows only: the columns of the chosen feature set plus the matching targets.
selected_columns = feature_sets[f]
features = all_features.loc[train, selected_columns]
targets = labels.loc[train]

print(features.shape, targets.shape)

(577, 200) (577,)


**Choose metrics to use**

In [107]:
# Display name -> scikit-learn scorer string for every metric reported by cross_validate.
scoring = dict(
    RMSE='neg_root_mean_squared_error',
    MAE='neg_mean_absolute_error',
    rSq='r2',
)

## Train-Validation split: 8 to 1 (resulting in 80-10-10 in train-val-test)

**Choose the number of times for validation**

In [108]:
# Number of train/validation splits (passed as n_splits to the splitters below).
num_cross_val = 1

# Only integers between 1 and 10 (inclusive) are accepted.
assert isinstance(num_cross_val, int)
assert num_cross_val >= 1 and num_cross_val <= 10

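**Choose split mode: random, stratified. Note:** the random mode uses `ShuffleSplit`, i.e. independent random train/validation partitions of the shuffled data (not a K-fold split); the stratified mode uses `StratifiedShuffleSplit` with the targets binned to their integer part.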

In [109]:
# How the train/validation split is drawn.
split_type = 'random'
allowed_split_types = ('random', 'stratified')

assert isinstance(split_type, str)
assert split_type in allowed_split_types

In [110]:
# Build the train/validation split iterator (8:1, i.e. test_size=1/9 of the training rows).
# The result of .split() can only be consumed once, so it has to be rebuilt
# before every search / cross-validation call.
if split_type == 'random':
    cv = ShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features)
elif split_type == 'stratified':
    # Stratify on the integer part of the target values.
    # (Fixed: the original referenced the undefined name `target`, raising NameError.)
    binned = targets.apply(lambda x: int(x)) # creating stratified indices
    cv = StratifiedShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features, binned)

# Randomized Grid Search for parameters

## Separately trying to optimize parameters for NN

In [111]:
"""
# Randomized Grid Search takes a lot of time, use best parameters straight away
# while reproducing the results the first time

nn_params = {
    'nn__hidden_layer_sizes': [(n,) for n in np.arange(5, 100, 5)],
    'nn__activation': ['tanh'],
    'nn__alpha': 10.0 ** -np.arange(1, 7),
    'nn__max_iter': [500, 1000],
}
"""

# Search space for the neural network. It is currently pinned to a single
# configuration so the expensive search is skipped.
nn_params = {
    'nn__max_iter': [1000],
    'nn__hidden_layer_sizes': [(50,)],
    'nn__alpha': [0.1],
    'nn__activation': ['tanh']
}

# Pipeline: drop zero-variance features -> standardise -> MLP regressor.
# The MLP is seeded so that weight initialisation (and therefore the fitted model
# and its scores) is reproducible between runs.
pipe = Pipeline([('zero-var-feature-remover', VarianceThreshold()),
                 ('scaler', StandardScaler()),
                 ('nn', MLPRegressor(random_state=42))])

# n_iter is an upper bound: with fewer candidates than n_iter every candidate is tried.
nn_estimator = RandomizedSearchCV(pipe, param_distributions=nn_params, cv=cv,
                                  refit=True, n_iter=100, n_jobs=-1, verbose=10,
                                  random_state=42).fit(features, targets)

Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.3s finished


In [112]:
# Keep the best hyper-parameters found by the NN search.
nn_best_params = nn_estimator.best_params_

# Persist them with IPython's %store magic so they can be restored with `%store -r`.
%store nn_best_params

Stored 'nn_best_params' (dict)


In [113]:
# Restore the stored NN hyper-parameters from IPython's %store database and show them.
%store -r nn_best_params
print(nn_best_params)

{'nn__max_iter': 1000, 'nn__hidden_layer_sizes': (50,), 'nn__alpha': 0.1, 'nn__activation': 'tanh'}


In [114]:
# Estimators to tune. The dict key doubles as the pipeline step name, which is why
# the search-space keys below are prefixed with '<key>__'.
# The stochastic estimators are seeded so that repeated runs give the same models.
estimators = {
    'rf':  RandomForestRegressor(random_state=42),
    'xgb': XGBRegressor(random_state=42),
    'krr': KernelRidge()
}

# Hyper-parameter search space per estimator (sampled by RandomizedSearchCV).
params = {
    'rf': {
        'rf__n_estimators': np.arange(50, 1050, 50),
    },
    'xgb': {
        'xgb__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
        'xgb__max_depth': np.arange(1, 11, 2),
        'xgb__n_estimators': np.arange(50, 550, 50),
        'xgb__subsample': [0.5, 1]
    },
    'krr': {
        'krr__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    }
}

In [None]:
%%time

best_params = {}
cv_scores = {}
test_score = {}

for f in feature_sets:
    print(f'Using {f} features...')
    features = all_features.loc[train, feature_sets[f]]

    best_params[f] = {}
    cv_scores[f] = {}
    test_score[f] = {}


    for e in estimators:
        print(f'\tRandom search optimisation for {e} estimator...')
        
        if split_type == 'random':
            cv = ShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features)
        elif split_type == 'stratified':
            binned = target.apply(lambda x: int(x)) # creating stratified indices
            cv = StratifiedShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features, binned)

        pipe = Pipeline([('zero-var-feature-remover', VarianceThreshold()), 
                        ('scaler', StandardScaler()),
                        (e, estimators[e])])

        model = RandomizedSearchCV(pipe, param_distributions=params[e], cv=cv,
                                scoring='neg_mean_squared_error',
                                refit=True, iid='False',
                                n_iter=20, n_jobs=-1,
                                verbose=0,
                                random_state=42).fit(features, targets)

        # get different metrics for the validation set
        if split_type == 'random':
            cv = ShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features)
        elif split_type == 'stratified':
            binned = target.apply(lambda x: int(x)) # creating stratified indices
            cv = StratifiedShuffleSplit(n_splits=num_cross_val, test_size=1/9, random_state=42).split(features, binned)

        val_results = cross_validate(estimator=model.best_estimator_,
                                    X=features, y=targets, cv=cv,
                                    scoring=scoring, n_jobs=-1,
                                    return_train_score=True)
        cv_scores[f][e] = val_results

        # record best model parameters
        best_params[f][e] = model.best_params_

        # get different metrics for the test set
        train_test_split_gen = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42).split(all_features.iloc[:working_size])
        
        
        test_results = cross_validate(estimator=model.best_estimator_,
                                    X=all_features.iloc[:working_size],
                                    y=labels.iloc[:working_size],
                                    cv=train_test_split_gen,
                                    scoring=scoring, n_jobs=-1)
        test_score[f][e] = test_results

%store best_params
%store cv_scores
%store test_score

Using RDKit features...
	Random search optimisation for rf estimator...


In [None]:
# Restore the stored best hyper-parameters (feature set -> estimator -> params) and show them.
%store -r best_params
print(best_params)

In [None]:
# Restore the stored validation results (feature set -> estimator -> cross_validate output) and show them.
%store -r cv_scores
print(cv_scores)

In [None]:
# Restore the stored test results and show the entry for the 'xgb' estimator
# on the 'RDKit' feature set (rows of the frame are estimators, columns are feature sets).
%store -r test_score
print(pd.DataFrame(pd.DataFrame(test_score).loc['xgb' ,'RDKit']))

In [None]:
# Inspect the container type of one recorded metric before serialising the scores to JSON.
type(cv_scores['RDKit']['rf']['fit_time'])

In [None]:
# Check that the metric converts to a plain float.
# NOTE(review): this presumably only works while the metric holds a single value
# (num_cross_val == 1) — confirm before raising num_cross_val.
float(cv_scores['RDKit']['rf']['fit_time'])

# Saving results (dictionaries) into JSON files

In [None]:
# JSON encoder hook for NumPy values that appear in hyper-parameter dictionaries
def default_params(o):
    """``default=`` hook for ``json.dump`` that converts NumPy values to built-ins.

    Parameters
    ----------
    o : object the JSON encoder could not serialise on its own.

    Returns
    -------
    int, float, bool or list
        ``np.integer`` -> int (e.g. values drawn from ``np.arange`` grids),
        ``np.floating`` -> float, ``np.bool_`` -> bool, ``np.ndarray`` -> nested list.

    Raises
    ------
    TypeError
        For any other type, as the ``json`` module expects from a ``default`` hook.
    """
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

In [None]:
# Write the best hyper-parameters of every (feature set, estimator) pair to a JSON file;
# NumPy integers are converted by `default_params`.
best_params_path = f'../results/{dataset}_{smile_type}_random_search_best_params.json'
with open(best_params_path, 'w') as f:
    json.dump(best_params, f, default=default_params)

In [None]:
# JSON encoder hook for the NumPy values found in cross_validate results
def default_scores(o):
    """``default=`` hook for ``json.dump`` that converts score containers to built-ins.

    Parameters
    ----------
    o : object the JSON encoder could not serialise on its own.

    Returns
    -------
    float or list
        A one-element ``np.ndarray`` becomes a plain float (the original behaviour,
        which applies when there is a single split). Arrays with any other number of
        elements become a list, so the hook also works when ``num_cross_val > 1``;
        calling ``float()`` on such an array raises.
        NumPy scalar floats/integers are converted to float/int.

    Raises
    ------
    TypeError
        For any other type, as the ``json`` module expects from a ``default`` hook.
    """
    if isinstance(o, np.ndarray):
        if o.size == 1:
            # .item() extracts the single element regardless of the array's rank.
            return float(o.item())
        return o.tolist()
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.integer):
        return int(o)
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

In [None]:
# Write the validation scores and the test scores to their own JSON files;
# NumPy arrays are converted by `default_scores`.
score_dumps = (
    (f'../results/{dataset}_{smile_type}_random_search_best_cv_scores.json', cv_scores),
    (f'../results/{dataset}_{smile_type}_random_search_best_test_score.json', test_score),
)
for scores_path, scores in score_dumps:
    with open(scores_path, 'w') as f:
        json.dump(scores, f, default=default_scores)