# Setup (same everywhere)

## Mount Drive


In [1]:
from google.colab import drive
# drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%cd drive/'My Drive'/repositories/moleculenet/notebooks

/content/drive/My Drive/repositories/moleculenet/notebooks


In [1]:
%ls

colab_RF_CIs_on_fingerprints_initial.ipynb
colab_extended_grid_search_pipeline.ipynb
colab_reproducing_grid_search_pipeline.ipynb
get_protenated_from_canonical.ipynb
visualise_grid_search_results.ipynb
visualise_reproducing_grid_search_results.ipynb


In [2]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv


## Import modules

In [3]:
import warnings
warnings.filterwarnings('ignore')

# saving models
import json
import pickle

# standard modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# metrics
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import jaccard_score # Tanimoto

# making custom metrics
from sklearn.metrics import make_scorer

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

# preprocessing
from sklearn.feature_selection import VarianceThreshold # to remove zero-var features
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.kernel_ridge import KernelRidge

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct

# pipelines
# https://scikit-learn.org/stable/modules/compose.html#combining-estimators
from sklearn.pipeline import make_pipeline, Pipeline

In [4]:
import sklearn
print(sklearn.__version__)

0.23.1


In [5]:
try:
    import forestci as fci
except ImportError as e:
    ! pip install forestci
    import forestci as fci

Failed to import duecredit due to No module named 'duecredit'


In [6]:
print('sklearn.__version__ :', sklearn.__version__)
print('fci.__version__ :', fci.__version__)

sklearn.__version__ : 0.23.1
fci.__version__ : 0.3


## Set plotting style

In [7]:
%matplotlib inline
plt.style.use('fivethirtyeight')

plt.rcParams['axes.facecolor']='w'
#plt.rcParams['axes.linewidth']=1
plt.rcParams['axes.edgecolor']='w'
plt.rcParams['figure.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
#plt.rcParams['grid.color']='white'

# Load Data (same everywhere)

## CHOOSE: dataset, smile_type

In [8]:
dataset = 'freesolv'
smile_type = 'original'

# READONLY

# small to large datasets
assert dataset in ['freesolv', 'esol', 'lipophilicity']
assert smile_type in ['original', 'protonated']

## Load Features and Targets

Leave all features here so setup and Load and prepare data are the same everywhere.

In [9]:
# original data
id_smile_target = pd.read_csv(f'../data/{dataset}_{smile_type}_IdSmilesLabels.csv', index_col=0)

# features
rdkit_features = pd.read_csv(f'../data/{dataset}_{smile_type}_rdkit_features.csv', index_col=0)

ecfp4_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp4_features.csv', index_col=0)
ecfp6_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp6_features.csv', index_col=0)

ecfp4_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp4_features.csv', index_col=0)
ecfp6_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp6_features.csv', index_col=0)

# load target
labels = id_smile_target['labels']

In [12]:
print('rdkit_features.shape:      ', rdkit_features.shape)
print('ecfp4_1024_features.shape: ', ecfp4_1024_features.shape)
print('ecfp6_1024_features.shape: ', ecfp6_1024_features.shape)
print('ecfp4_2048_features.shape: ', ecfp4_2048_features.shape)
print('ecfp6_2048_features.shape: ', ecfp6_2048_features.shape)
print('labels.shape:              ', labels.shape)

rdkit_features.shape:       (642, 200)
ecfp4_1024_features.shape:  (642, 1024)
ecfp6_1024_features.shape:  (642, 1024)
ecfp4_2048_features.shape:  (642, 2048)
ecfp6_2048_features.shape:  (642, 2048)
labels.shape:               (642,)


In [14]:
for name in rdkit_features.columns:
    print(name)

MaxEStateIndex
MinEStateIndex
MaxAbsEStateIndex
MinAbsEStateIndex
qed
MolWt
HeavyAtomMolWt
ExactMolWt
NumValenceElectrons
NumRadicalElectrons
MaxPartialCharge
MinPartialCharge
MaxAbsPartialCharge
MinAbsPartialCharge
FpDensityMorgan1
FpDensityMorgan2
FpDensityMorgan3
BalabanJ
BertzCT
Chi0
Chi0n
Chi0v
Chi1
Chi1n
Chi1v
Chi2n
Chi2v
Chi3n
Chi3v
Chi4n
Chi4v
HallKierAlpha
Ipc
Kappa1
Kappa2
Kappa3
LabuteASA
PEOE_VSA1
PEOE_VSA10
PEOE_VSA11
PEOE_VSA12
PEOE_VSA13
PEOE_VSA14
PEOE_VSA2
PEOE_VSA3
PEOE_VSA4
PEOE_VSA5
PEOE_VSA6
PEOE_VSA7
PEOE_VSA8
PEOE_VSA9
SMR_VSA1
SMR_VSA10
SMR_VSA2
SMR_VSA3
SMR_VSA4
SMR_VSA5
SMR_VSA6
SMR_VSA7
SMR_VSA8
SMR_VSA9
SlogP_VSA1
SlogP_VSA10
SlogP_VSA11
SlogP_VSA12
SlogP_VSA2
SlogP_VSA3
SlogP_VSA4
SlogP_VSA5
SlogP_VSA6
SlogP_VSA7
SlogP_VSA8
SlogP_VSA9
TPSA
EState_VSA1
EState_VSA10
EState_VSA11
EState_VSA2
EState_VSA3
EState_VSA4
EState_VSA5
EState_VSA6
EState_VSA7
EState_VSA8
EState_VSA9
VSA_EState1
VSA_EState10
VSA_EState2
VSA_EState3
VSA_EState4
VSA_EState5
VSA_EState6
VS

In [13]:
labels.head()

id
4-methoxy-N,N-dimethyl-benzamide   -11.01
methanesulfonyl chloride            -4.87
3-methylbut-1-ene                    1.83
2-ethylpyrazine                     -5.45
heptan-1-ol                         -4.21
Name: labels, dtype: float64

## Create one DataFrame with all features

In [14]:
all_features = pd.concat([rdkit_features,
                          ecfp4_1024_features, ecfp6_1024_features,
                          ecfp4_2048_features, ecfp6_2048_features],
                         axis='columns')

In [15]:
all_features.shape

(642, 6344)

In [16]:
all_features.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi0,Chi0n,Chi0v,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3n,Chi3v,Chi4n,Chi4v,HallKierAlpha,Ipc,Kappa1,Kappa2,Kappa3,LabuteASA,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,...,2048ecfp6-2008,2048ecfp6-2009,2048ecfp6-2010,2048ecfp6-2011,2048ecfp6-2012,2048ecfp6-2013,2048ecfp6-2014,2048ecfp6-2015,2048ecfp6-2016,2048ecfp6-2017,2048ecfp6-2018,2048ecfp6-2019,2048ecfp6-2020,2048ecfp6-2021,2048ecfp6-2022,2048ecfp6-2023,2048ecfp6-2024,2048ecfp6-2025,2048ecfp6-2026,2048ecfp6-2027,2048ecfp6-2028,2048ecfp6-2029,2048ecfp6-2030,2048ecfp6-2031,2048ecfp6-2032,2048ecfp6-2033,2048ecfp6-2034,2048ecfp6-2035,2048ecfp6-2036,2048ecfp6-2037,2048ecfp6-2038,2048ecfp6-2039,2048ecfp6-2040,2048ecfp6-2041,2048ecfp6-2042,2048ecfp6-2043,2048ecfp6-2044,2048ecfp6-2045,2048ecfp6-2046,2048ecfp6-2047
"4-methoxy-N,N-dimethyl-benzamide",12.42817,-3.458874,12.42817,0.519264,0.68636,179.219,166.115,179.094629,70.0,0.0,0.252836,-0.496768,0.496768,0.252836,1.384615,2.076923,2.615385,3.67558,674.590985,20.825909,19.26371,6.26371,11.39257,9.533193,3.033193,1.946749,1.946749,1.12108,1.12108,0.570798,0.570798,-1.51,110210.129799,2.108111,3.93736,2.211653,96.190689,9.636773,5.749512,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
methanesulfonyl chloride,9.85571,-4.368056,9.85571,3.074846,0.421427,114.553,111.529,113.954228,32.0,0.0,0.229212,-0.212518,0.229212,0.212518,2.0,2.2,2.2,4.534785,166.212669,7.0,5.102709,3.675135,3.25,2.191761,3.001103,0.437848,2.532383,0.0,0.0,0.0,0.0,0.24,21.306059,1.797156,1.143107,369.351111,39.38663,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3-methylbut-1-ene,7.349537,-3.289005,7.349537,1.280324,0.412737,70.135,60.055,70.07825,30.0,0.0,0.057236,-0.102824,0.102824,0.057236,1.8,2.6,2.8,5.772015,248.517785,12.654701,12.5,2.5,6.354059,6.0,1.0,0.5,0.5,0.125,0.125,0.0,0.0,-0.26,644.471039,0.351195,2.00738,3.74,47.559121,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2-ethylpyrazine,7.363796,-3.006484,7.363796,0.587878,0.536795,108.144,100.08,108.068748,42.0,0.0,0.08596,-0.261224,0.261224,0.08596,1.375,2.25,3.125,3.394825,411.724942,12.723615,11.894427,3.894427,7.070386,5.894427,1.894427,1.032624,1.032624,0.553812,0.553812,0.267705,0.267705,-0.92,1798.391122,1.1509,2.403302,1.025681,59.840347,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
heptan-1-ol,7.58125,-4.173333,7.58125,3.423878,0.544191,116.204,100.076,116.120115,50.0,0.0,0.210037,-0.396377,0.396377,0.210037,1.125,1.875,2.625,6.416917,459.536609,20.207107,19.908248,3.908248,10.06066,9.612372,1.704124,0.727062,0.727062,0.301031,0.301031,0.119266,0.119266,-0.04,27745.345015,0.731455,6.96,5.96,74.289336,5.108808,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# CHOOSE: what feature_sets we will iterate over

Create a dictionary containing feature names (index objects) as its elements

In [17]:
# reproducing the paper - so only using the features like in the paper
feature_sets = {
    '1024ecfp-4': ecfp4_1024_features.columns
}

# Train-Validation-Test split

## CHOOSE: use_small - use 100 observations

In [18]:
# if True, use only 100 observations with 90-10 train-test-split for computational efficiency
use_small = False

In [19]:
if use_small:
    working_size = 100
else:
    working_size = all_features.shape[0]

## CHOOSE: way to do train-val-test splits

In [20]:
def make_split_generator(X, y, split_type='random', random_state=42, n_splits=1, test_size=0.1):
    assert split_type in ['random', 'stratified']

    if split_type == 'random':
        gen = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state).split(X)
    elif split_type == 'stratified':
        binned = y.apply(lambda x: int(x)) # creating stratified indices
        gen = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state).split(X, binned)
    
    # gen for generator
    return gen

### TrainVal-Test split: 90/10

In [21]:
# needed fot creating a dataframe of train_val features (reproduced for testing)
trainval_test_split_gen = make_split_generator(X=all_features.iloc[:working_size],
                                               y=labels[:working_size],
                                               split_type='random', random_state=42,
                                               n_splits=1, test_size=0.1)
#get numeric indexes
train_val, test = next(trainval_test_split_gen)
# get real indexes (i.e. Chembl id, substance name)
train_val, test = all_features.iloc[train_val].index, all_features.iloc[test].index
# we will use them later

print('TrainVal:')
print(train_val[:5], len(train_val))

print('\nTest:')
print(test[:5], len(test))

TrainVal:
Index(['cyclohexanamine', 'diphenyl ether', 'ethanol', '3-methylheptane',
       '111-trifluoropropan-2-ol'],
      dtype='object') 577

Test:
Index(['1-bromo-2-methyl-propane', '1,2,4-trichlorodibenzo-p-dioxin',
       '1-amino-9,10-anthracenedione', 'triethylphosphate',
       '2-(nitrooxy)ethan-1-ol'],
      dtype='object') 65


## CHOOSE: feature set to use for now, will itarate later

**This is only used for debugging**

In [22]:
# main feature set to use for now
f = '1024ecfp-4'
assert f in feature_sets

In [23]:
# train_val data frame
features = all_features.loc[train_val, feature_sets[f]]
#train_val targets
targets = labels.loc[train_val]

print(features.shape, targets.shape)

(577, 1024) (577,)


### Train-Val split: 80/10 (resulting in 80-10-10 in train-val-test)

**This is only used for debugging**

In [24]:
# needed fot creating a dataframe of train_val features (reproduced for testing)
train_val_split_gen = make_split_generator(X=features, y=targets,
                                           split_type='random', random_state=42,
                                           n_splits=1, test_size=1/9)
# get numeric indexes
train, val = next(train_val_split_gen)
# get real indexes (i.e. Chembl id, substance name)
train, val = all_features.iloc[train].index, all_features.iloc[val].index

print('Train:')
print(train[:5], len(train))

print('\nVal:')
print(val[:5], len(val))

Train:
Index(['octan-2-one',
       '1-N,1-N-diethyl-2,6-dinitro-4-(trifluoromethyl)benzene-1,3-diamine',
       'trimethoxymethylbenzene', 'fenuron', 'isobutyl nitrate'],
      dtype='object') 512

Val:
Index(['3-hydroxybenzonitrile', 'pyrrole', 'methyl 4-methoxybenzoate',
       '1-propylsulfanylpropane', 'octan-1-ol'],
      dtype='object') 65


# Training

## CHOOSE: metrics to use

See sklearn documentation 3.3.1.4. Using multiple metric evaluation:


In [25]:
def pearson_corr_coef(y_true, y_pred):
    """
    Original scipy.stats.pearsonr returns a tuple (r, p):
        r : float
            Pearson's correlation coefficient.  
        p-value : float
            Two-tailed p-value.
    """
    return pearsonr(y_true, y_pred)[0]

In [26]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [27]:
scoring = {
    'RMSE': make_scorer(rmse, greater_is_better=False),
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'R^2': make_scorer(r2_score, greater_is_better=True),
    'pearson_r': make_scorer(pearson_corr_coef, greater_is_better=True)
}

## CHOOSE: estimators to consider

### From the Paper (estimators):

- 3.5.3 **Kernel ridge regression.** Kernel ridge regression (KRR) is a combination of ridge regression and kernel trick. By using a nonlinear kernel function (**radial basis function**), it learns a non-linear function in the original space that maps features to predicted values.

- 3.5.4 **Random forests.** Random forests (RF) are ensemble prediction methods.(72) A random forest consists of many individual decision trees, each of which is trained on a subsampled version of the original dataset. The results for individual trees are averaged to provide output predictions for the full forest. Random forests can be used for both classification and regression tasks. Training a random forest can be computa- tionally intensive, so benchmarks only include random forest results for smaller datasets.

- 3.5.5 **Gradient boosting.** Gradient boosting is another ensemble method consisting of individual decision trees.(73) In contrast to random forests, it builds relatively simple trees which are sequentially incorporated to the ensemble. In each step, a new tree is generated in a greedy manner to minimize loss function. A sequence of such “weak” trees are combined together into an additive model. We utilize the XGBoost implementation of gradient boosting in DeepChem.(79)

In [28]:
estimators = {
    'rf':  RandomForestRegressor(),
    'xgb': XGBRegressor(),
    'krr': KernelRidge(kernel='rbf'), # 'rbf' used in the paper (defaults to 'linear')
    'gp': GaussianProcessRegressor(normalize_y=True), # normilize since we have not normalized here
}

## CHOOSE: hyperparameters to tune

### From the supplementary materials (hyperparameters):

1. **Model Training and Hyperparameter Optimization**
All models were trained on Stanford’s GPU clusters via DeepChem. No model was allowed to train for more than 10 hours(time profile in Table S1. Users can reproduce benchmarks locally by following directions from DeepChem.
Hyperparameters were determined using Gaussian Process Optimization via pyGPGO (https://github.com/hawk31/pyGPGO), with max number of iterations set to 20. Optimized hyperparameters for each model are listed, detailed hyperparameters
can be found on Deepchem.

    1.3 Kernel Ridge Regression (KRR)
        - Penalty parameter
    1.4 Random Forest (RF)
        - Number of trees in the forest: 500
    1.5 Gradient Boosting (XGBoost)
        - Maximum tree depth
        - Learning rate
        - Number of boosted tree

In [29]:
params = {
    'rf': {
        'rf__n_estimators': [500], # used in the paper
    },
    'xgb': {
        'xgb__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5], # from LFTL paper
        'xgb__max_depth': np.arange(1, 11, 2),
        'xgb__n_estimators': np.arange(50, 550, 50),
    },
    'krr': {
        'krr__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    },
    'gp': {
        'gp__kernel': [RBF() + WhiteKernel(),
                       Matern() + WhiteKernel(),
                       DotProduct() + WhiteKernel()]
    }
}

## CHOOSE: number of iterations

**Set n_iter=20 to be consistent with GP hyperparameter search**

## Gridsearch loop

In [30]:
# %%time

best_params = {}
val_scores = {}
train_test_scores = {}

for f in feature_sets:
    print(f'Using {f} features...')

    # train_val data frame
    features = all_features.loc[train_val, feature_sets[f]]
    # train_val targets
    targets = labels.loc[train_val]

    best_params[f] = {}
    val_scores[f] = {}
    train_test_scores[f] = {}


    for e in estimators:
        print(f'\tRandom search optimisation for {e} estimator...')

        # leave like that so that parameter keys keep working
        pipe = Pipeline([(e, estimators[e])])
        
        # make train/val split generator
        train_val_split_gen = make_split_generator(X=features, y=targets,
                                                   split_type='random', random_state=42,
                                                   n_splits=1, test_size=1/9)

        # fit models and optimize paramerers
            # refit=False: .best_estimator_ is not available, .but best_params_ are
            # scoring='neg_mean_squared_error': equivalent to RMSE (might be faster to use builtin version)
        model = RandomizedSearchCV(pipe, param_distributions=params[e],
                                   cv=train_val_split_gen,
                                   scoring='neg_mean_squared_error',
                                   refit=True, # False: can's use .best_estimator_
                                   n_iter=20, n_jobs=-1,
                                   random_state=42).fit(features, targets)

        # record best model parameters
        best_params[f][e] = model.best_params_

        # make generator for 3 train/val splits
        train_val_split_gen = make_split_generator(X=features, y=targets,
                                                   split_type='random', random_state=42,
                                                   n_splits=3, test_size=1/9)


        # get metrics for the validation set
        val_results = cross_validate(estimator=model.best_estimator_,
                                     #estimator=pipe
                                     #fit_params=model.best_params_, 
                                     X=features, y=targets,
                                     cv=train_val_split_gen,
                                     scoring=scoring, n_jobs=-1)
        

        # record metrics (validation set) when fitting with best parameters
        val_scores[f][e] = val_results

        # make generator for 3 trainval/test splits
        trainval_test_split_gen = make_split_generator(X=all_features.iloc[:working_size],
                                                       y=labels[:working_size],
                                                       split_type='random', random_state=42,
                                                       n_splits=3, test_size=0.1)
        
        # get metrics for the train and test set
        #   make sure to restrict feature set here (don't want everything)
        test_results = cross_validate(estimator=model.best_estimator_,
                                      #estimator=pipe,
                                      #fit_params=model.best_params_,
                                      X=all_features[feature_sets[f]].iloc[:working_size],
                                      y=labels.iloc[:working_size],
                                      cv=trainval_test_split_gen,
                                      scoring=scoring, n_jobs=-1,
                                      return_train_score=True)
        train_test_scores[f][e] = test_results

%store best_params
%store val_scores
%store train_test_scores

Using 1024ecfp-4 features...
	Random search optimisation for rf estimator...
	Random search optimisation for xgb estimator...
	Random search optimisation for krr estimator...
	Random search optimisation for gp estimator...
Stored 'best_params' (dict)
Stored 'val_scores' (dict)
Stored 'train_test_scores' (dict)


In [31]:
# retrieve 
%store -r best_params
print(best_params)

{'1024ecfp-4': {'rf': {'rf__n_estimators': 500}, 'xgb': {'xgb__n_estimators': 100, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.2}, 'krr': {'krr__alpha': 0.01}, 'gp': {'gp__kernel': Matern(length_scale=1, nu=1.5) + WhiteKernel(noise_level=1)}}}


In [32]:
# retrieve 
%store -r val_scores
print(val_scores)

{'1024ecfp-4': {'rf': {'fit_time': array([13.67025661, 13.5763967 ,  8.72875261]), 'score_time': array([0.06913567, 0.06881833, 0.04356933]), 'test_RMSE': array([-2.0652686 , -2.7198849 , -2.44871824]), 'test_MAE': array([-1.32842992, -1.21508427, -1.41746871]), 'test_R^2': array([0.68958705, 0.59216012, 0.61828794]), 'test_pearson_r': array([0.84292259, 0.76972331, 0.81527464])}, 'xgb': {'fit_time': array([3.76285124, 3.72412491, 2.4172976 ]), 'score_time': array([0.02843904, 0.02651   , 0.01561689]), 'test_RMSE': array([-1.99913261, -2.17552875, -2.06884979]), 'test_MAE': array([-1.34843504, -1.16502681, -1.30771738]), 'test_R^2': array([0.70914941, 0.73907344, 0.72753158]), 'test_pearson_r': array([0.84566209, 0.8647542 , 0.85898064])}, 'krr': {'fit_time': array([0.07818103, 0.10201812, 0.06083035]), 'score_time': array([0.01396918, 0.01514673, 0.00926852]), 'test_RMSE': array([-1.95879198, -2.31997877, -2.11971704]), 'test_MAE': array([-1.36544878, -1.2765197 , -1.27645511]), 'test

In [33]:
# retrieve 
%store -r train_test_scores
display(pd.DataFrame(pd.DataFrame(train_test_scores).loc['xgb' ,'1024ecfp-4']))

Unnamed: 0,fit_time,score_time,test_RMSE,train_RMSE,test_MAE,train_MAE,test_R^2,train_R^2,test_pearson_r,train_pearson_r
0,4.400016,0.027253,-2.512797,-0.395494,-1.512942,-0.277473,0.697992,0.988901,0.83628,0.994946
1,4.391989,0.027465,-2.087099,-0.438377,-1.212925,-0.305339,0.759531,0.986661,0.876977,0.993931
2,2.794128,0.015545,-1.708535,-0.435563,-1.126519,-0.309238,0.768243,0.987375,0.877118,0.994252


In [34]:
val_scores['1024ecfp-4']['rf']['test_RMSE'], type(val_scores['1024ecfp-4']['rf']['test_RMSE'])

(array([-2.0652686 , -2.7198849 , -2.44871824]), numpy.ndarray)

# CHOOSE: Names for saving results

## Into JSON files

In [35]:
# # JSON encoder for np.int64
# def default_params(o):
#     if isinstance(o, np.integer):
#         return int(o)
#     raise TypeError

In [36]:
# with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_params.json', 'w') as f:
#     json.dump(best_params, f, default=default_params)

In [37]:
# # JSON encoder for np.float64
# def default_scores(o):
#     if isinstance(o, np.ndarray):
#         return float(o)
#     raise TypeError

In [38]:
# with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_cv_scores.json', 'w') as f:
#     json.dump(cv_scores, f, default=default_scores)
    
# with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_test_score.json', 'w') as f:
#     json.dump(test_score, f, default=default_scores)

## Into Pickle files

In [39]:
with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_val_scores.pickle', 'wb') as fp:
    pickle.dump(val_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_train_test_scores.pickle', 'wb') as fp:
    pickle.dump(train_test_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_best_params.pickle', 'wb') as fp:
    pickle.dump(best_params, fp, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'../results/{dataset}_{smile_type}_reproducing_random_search_grid_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
# with open(f'../results/{dataset}_{smile_type}_random_search_best_cv_scores.pickle', 'rb') as fp:
#     cv_scores = pickle.load(fp)

# with open(f'../results/{dataset}_{smile_type}_random_search_best_test_score.pickle', 'rb') as fp:
#     test_score = pickle.load(fp)

# with open(f'../results/{dataset}_{smile_type}_random_search_best_params.pickle', 'rb') as fp:
#     best_params = pickle.load(fp)

# with open(f'../results/{dataset}_{smile_type}_random_search_grid_params.pickle', 'rb') as fp:
#     params = pickle.load(fp)

In [41]:
# cv_scores

In [42]:
# test_score

In [43]:
# best_params

In [44]:
# params