# Setup (same everywhere)

## Mount Drive


In [6]:
from google.colab import drive
# drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [2]:
%cd drive/'My Drive'/repositories/moleculenet/notebooks

/content/drive/My Drive/repositories/moleculenet/notebooks


In [3]:
%ls

colab_extended_GPflow_pipeline.ipynb
colab_extended_GPflow_regression.ipynb
colab_extended_grid_search_pipeline.ipynb
colab_reproducing_ci_gp.ipynb
colab_reproducing_ci_rf.ipynb
colab_reproducing_grid_search_pipeline.ipynb
eda.ipynb
visualise_extended_grid_search_results.ipynb
visualise_reproducing_grid_search_results.ipynb


In [4]:
%ls ../data/

esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original.csv
esol_original_extra_features.csv
esol_original_IdSmilesLabels.csv
esol_original_rdkit_features.csv
ESOL_README
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
FreeSolv_README
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv
Lipo_README


## Import modules

### Standard imports

In [5]:
import warnings
warnings.filterwarnings('ignore')

# custom imports
import os
import sys

# saving models
import json
import pickle

# standard modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# metrics
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import jaccard_score # Tanimoto

# making custom metrics
from sklearn.metrics import make_scorer

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

# preprocessing
from sklearn.feature_selection import VarianceThreshold # to remove zero-var features
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.kernel_ridge import KernelRidge

# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel
# from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct

# pipelines
# https://scikit-learn.org/stable/modules/compose.html#combining-estimators
from sklearn.pipeline import make_pipeline, Pipeline

In [6]:
try:
    import gpflow
    print('Successful import')
except ModuleNotFoundError:
    !pip install gpflow
    import gpflow
    print('Successful import')

import tensorflow as tf

from gpflow.mean_functions import Constant
from gpflow.utilities import positive, print_summary
from gpflow.utilities.ops import broadcasting_elementwise

Successful import


### Custom imports

In [7]:
sys.path.insert(0, '..')

# global vars
from util_scripts.plotting_functions_and_vars import datasets_to_titles, datasets_to_units, metrics_to_labels

sys.path.insert(0, './notebooks')

## Set plotting style

In [9]:
%matplotlib inline
plt.style.use('fivethirtyeight')

plt.rcParams['axes.facecolor']='w'
#plt.rcParams['axes.linewidth']=1
plt.rcParams['axes.edgecolor']='w'
plt.rcParams['figure.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
#plt.rcParams['grid.color']='white'

# Load Data (same everywhere)

## CHOOSE: dataset, smile_type, grid_search_type, haw we deal with highly correlated features

In [201]:
dataset = 'lipophilicity'
assert dataset in ['freesolv', 'esol', 'lipophilicity']

smile_type = 'original'
assert smile_type in ['original', 'protonated']

grid_search_type = 'extended'
assert grid_search_type in ['reproducing', 'extended']

## Load Features and Targets

Leave all features here so setup and Load and prepare data are the same everywhere.

In [202]:
# original data
id_smile_target = pd.read_csv(f'../data/{dataset}_{smile_type}_IdSmilesLabels.csv', index_col=0)
# labels
labels = id_smile_target['labels']

In [203]:
# fingerprints
ecfp4_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp4_features.csv', index_col=0)
ecfp6_1024_features = pd.read_csv(f'../data/{dataset}_{smile_type}_1024ecfp6_features.csv', index_col=0)

ecfp4_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp4_features.csv', index_col=0)
ecfp6_2048_features = pd.read_csv(f'../data/{dataset}_{smile_type}_2048ecfp6_features.csv', index_col=0)

In [204]:
print('ecfp4_1024_features.shape: ', ecfp4_1024_features.shape)
print('ecfp6_1024_features.shape: ', ecfp6_1024_features.shape)
print('ecfp4_2048_features.shape: ', ecfp4_2048_features.shape)
print('ecfp6_2048_features.shape: ', ecfp6_2048_features.shape)
print('labels.shape:              ', labels.shape)

ecfp4_1024_features.shape:  (4200, 1024)
ecfp6_1024_features.shape:  (4200, 1024)
ecfp4_2048_features.shape:  (4200, 2048)
ecfp6_2048_features.shape:  (4200, 2048)
labels.shape:               (4200,)


In [205]:
labels.head()

id
CHEMBL596271     3.54
CHEMBL1951080   -1.18
CHEMBL1771       3.69
CHEMBL234951     3.37
CHEMBL565079     3.10
Name: labels, dtype: float64

## Create one DataFrame with all features

In [206]:
all_features = pd.concat([ecfp4_1024_features, ecfp6_1024_features,
                          ecfp4_2048_features, ecfp6_2048_features],
                         axis='columns')

In [207]:
all_features.shape

(4200, 6144)

In [208]:
all_features.head()

Unnamed: 0,1024ecfp4-0,1024ecfp4-1,1024ecfp4-2,1024ecfp4-3,1024ecfp4-4,1024ecfp4-5,1024ecfp4-6,1024ecfp4-7,1024ecfp4-8,1024ecfp4-9,1024ecfp4-10,1024ecfp4-11,1024ecfp4-12,1024ecfp4-13,1024ecfp4-14,1024ecfp4-15,1024ecfp4-16,1024ecfp4-17,1024ecfp4-18,1024ecfp4-19,1024ecfp4-20,1024ecfp4-21,1024ecfp4-22,1024ecfp4-23,1024ecfp4-24,1024ecfp4-25,1024ecfp4-26,1024ecfp4-27,1024ecfp4-28,1024ecfp4-29,1024ecfp4-30,1024ecfp4-31,1024ecfp4-32,1024ecfp4-33,1024ecfp4-34,1024ecfp4-35,1024ecfp4-36,1024ecfp4-37,1024ecfp4-38,1024ecfp4-39,...,2048ecfp6-2008,2048ecfp6-2009,2048ecfp6-2010,2048ecfp6-2011,2048ecfp6-2012,2048ecfp6-2013,2048ecfp6-2014,2048ecfp6-2015,2048ecfp6-2016,2048ecfp6-2017,2048ecfp6-2018,2048ecfp6-2019,2048ecfp6-2020,2048ecfp6-2021,2048ecfp6-2022,2048ecfp6-2023,2048ecfp6-2024,2048ecfp6-2025,2048ecfp6-2026,2048ecfp6-2027,2048ecfp6-2028,2048ecfp6-2029,2048ecfp6-2030,2048ecfp6-2031,2048ecfp6-2032,2048ecfp6-2033,2048ecfp6-2034,2048ecfp6-2035,2048ecfp6-2036,2048ecfp6-2037,2048ecfp6-2038,2048ecfp6-2039,2048ecfp6-2040,2048ecfp6-2041,2048ecfp6-2042,2048ecfp6-2043,2048ecfp6-2044,2048ecfp6-2045,2048ecfp6-2046,2048ecfp6-2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# CHOOSE: what feature_sets we will iterate over

## Put original features into feature_sets
Create a dictionary containing feature names (index objects) as its elements

In [209]:
feature_sets = {
    '1024ecfp-4': ecfp4_1024_features.columns,
    '1024ecfp-6': ecfp6_1024_features.columns,
    '2048ecfp-4': ecfp4_2048_features.columns,
    '2048ecfp-6': ecfp6_2048_features.columns,
}

# Train-Validation-Test split

## CHOOSE: use_small - use 100 observations

In [210]:
# if True, use only 100 observations with 90-10 train-test-split for computational efficiency
use_small = False

In [211]:
if use_small:
    working_size = 100
else:
    working_size = all_features.shape[0]

## CHOOSE: way to do train-val-test splits

In [212]:
def make_split_generator(X, y, split_type='random', random_state=42, n_splits=1, test_size=0.1):
    assert split_type in ['random', 'stratified']

    if split_type == 'random':
        gen = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state).split(X)
    elif split_type == 'stratified':
        binned = y.apply(lambda x: int(x)) # creating stratified indices
        gen = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state).split(X, binned)
    
    # gen for generator
    return gen

### TrainVal-Test split: 90/10

In [213]:
# needed fot creating a dataframe of train_val features (reproduced for testing)
trainval_test_split_gen = make_split_generator(X=all_features.iloc[:working_size],
                                               y=labels[:working_size],
                                               split_type='random', random_state=42,
                                               n_splits=1, test_size=0.1)
#get numeric indexes
train_val, test = next(trainval_test_split_gen)
# get real indexes (i.e. Chembl id, substance name)
train_val, test = all_features.iloc[train_val].index, all_features.iloc[test].index
# we will use them later

print('TrainVal:')
print(train_val[:5], len(train_val))

print('\nTest:')
print(test[:5], len(test))

TrainVal:
Index(['CHEMBL2325714', 'CHEMBL256985', 'CHEMBL298384', 'CHEMBL205807',
       'CHEMBL1652621'],
      dtype='object') 3780

Test:
Index(['CHEMBL1431112', 'CHEMBL1322675', 'CHEMBL2030964', 'CHEMBL1381989',
       'CHEMBL74582'],
      dtype='object') 420


## CHOOSE: feature set to use for now, will itarate later

**This is only used for debugging**

In [214]:
# main feature set to use for now
f = '2048ecfp-4'
assert f in feature_sets

In [215]:
# train_val data frame
features = all_features.loc[train_val, feature_sets[f]]
#train_val targets
targets = labels.loc[train_val]

print(features.shape, targets.shape)

(3780, 2048) (3780,)


### Train-Val split: 80/10 (resulting in 80-10-10 in train-val-test)

**This is only used for debugging**

In [216]:
# needed fot creating a dataframe of train_val features (reproduced for testing)
train_val_split_gen = make_split_generator(X=features, y=targets,
                                           split_type='random', random_state=42,
                                           n_splits=1, test_size=1/9)
# get numeric indexes
train, val = next(train_val_split_gen)
# get real indexes (i.e. Chembl id, substance name)
train, val = all_features.iloc[train].index, all_features.iloc[val].index

print('Train:')
print(train[:5], len(train))

print('\nVal:')
print(val[:5], len(val))

Train:
Index(['CHEMBL2035039', 'CHEMBL51776', 'CHEMBL429682', 'CHEMBL452',
       'CHEMBL1822878'],
      dtype='object') 3360

Val:
Index(['CHEMBL460', 'CHEMBL1689118', 'CHEMBL20210', 'CHEMBL320882',
       'CHEMBL1223955'],
      dtype='object') 420


# Training

## Tanimoto kernel

In [217]:
class Tanimoto(gpflow.kernels.Kernel):
    def __init__(self):
        super().__init__()
        # We constrain the value of the kernel variance to be positive when it's being optimised
        self.variance = gpflow.Parameter(1.0, transform=positive())

    def K(self, X, X2=None):
        """
        Compute the Tanimoto kernel matrix σ² * ((<x, y>) / (||x||^2 + ||y||^2 - <x, y>))

        :param X: N x D array
        :param X2: M x D array. If None, compute the N x N kernel matrix for X.
        :return: The kernel matrix of dimension N x M
        """
        if X2 is None:
            X2 = X

        Xs = tf.reduce_sum(tf.square(X), axis=-1)  # Squared L2-norm of X
        X2s = tf.reduce_sum(tf.square(X2), axis=-1)  # Squared L2-norm of X2
        outer_product = tf.tensordot(X, X2, [[-1], [-1]])  # outer product of the matrices X and X2

        # Analogue of denominator in Tanimoto formula

        denominator = -outer_product + broadcasting_elementwise(tf.add, Xs, X2s)

        return self.variance * outer_product/denominator

    def K_diag(self, X):
        """
        Compute the diagonal of the N x N kernel matrix of X
        :param X: N x D array
        :return: N x 1 array
        """
        return tf.fill(tf.shape(X)[:-1], tf.squeeze(self.variance))

In [218]:
def transform_data(X_train, y_train, X_test, y_test):
    """
    Apply feature scaling, dimensionality reduction to the data. Return the standardised and low-dimensional train and
    test sets together with the scaler object for the target values.

    :param X_train: input train data
    :param y_train: train labels
    :param X_test: input test data
    :param y_test: test labels
    :return: X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler
    """

    x_scaler = StandardScaler()
    X_train_scaled = x_scaler.fit_transform(X_train)
    X_test_scaled = x_scaler.transform(X_test)
    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train)
    y_test_scaled = y_scaler.transform(y_test)

    return X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled, y_scaler

In [219]:
def fit_tanimoto_gpr(X_train, y_train, X_test, y_test, results):

    # We define the Gaussian Process Regression Model using the Tanimoto kernel

    m = None
    def objective_closure():
        return -m.log_marginal_likelihood()

    # ----

    # reshape and convert to numpy
    y_train = y_train.values.reshape(-1, 1)
    y_test = y_test.values.reshape(-1, 1)

    #  We standardise the outputs but leave the inputs unchanged
    _, y_train, _, y_test, y_scaler = transform_data(X_train, y_train, X_test, y_test)

    # to_numpy()
    X_train = X_train.values.astype(np.float64)
    X_test = X_test.values.astype(np.float64)

    # kernel
    k = Tanimoto()
    m = gpflow.models.GPR(data=(X_train, y_train), mean_function=Constant(np.mean(y_train)), kernel=k, noise_variance=1)

    # Optimise the kernel variance and noise level by the marginal likelihood

    opt = gpflow.optimizers.Scipy()
    opt.minimize(objective_closure, m.trainable_variables, options=dict(maxiter=100))
    print_summary(m)

    # ----

    # mean and variance GP prediction
    #
    # train set
    y_train_pred, y_train_var = m.predict_f(X_train)
    y_train_pred = y_scaler.inverse_transform(y_train_pred)
    y_train = y_scaler.inverse_transform(y_train)
    # test set
    y_test_pred, y_test_var = m.predict_f(X_test)
    y_test_pred = y_scaler.inverse_transform(y_test_pred)
    y_test = y_scaler.inverse_transform(y_test)

    # ----

    # Train set metrics: compute
    score = r2_score(y_train, y_train_pred)
    rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    mae = mean_absolute_error(y_train, y_train_pred)
    pearson_r = pearsonr(y_train.squeeze(), y_train_pred.squeeze())[0]

    # Train set metrics: record
    results['train_R^2'].append(score)
    results['train_RMSE'].append(rmse)
    results['train_MAE'].append(mae)
    results['train_pearson_r'].append(pearson_r)
    
    ## Train set metrics: print
    # print("\nTrain R^2: {:.3f}".format(score))
    print("Train RMSE: {:.3f} {}".format(rmse, datasets_to_units[dataset]))
    # print("Train MAE: {:.3f} {}".format(mae, datasets_to_units[dataset]))
    # print("Train Pearson r: {:.3f}".format(pearson_r))

    # ----

    # Test set metrics: compute
    score = r2_score(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)
    pearson_r = pearsonr(y_test.squeeze(), y_test_pred.squeeze())[0]

    # Test set metrics: record
    results['test_R^2'].append(score)
    results['test_RMSE'].append(rmse)
    results['test_MAE'].append(mae)
    results['test_pearson_r'].append(pearson_r)

    ## Test set metrics: print
    # print("\nTest R^2: {:.3f}".format(score))
    print("Test RMSE: {:.3f} {}".format(rmse, datasets_to_units[dataset]))
    # print("Test MAE: {:.3f} {}".format(mae, datasets_to_units[dataset]))
    # print("Test Pearson r: {:.3f}".format(pearson_r))

    return results


## Training/validation loop

In [220]:
# generator to get 90% of the indexes
trainval_test_split_gen = make_split_generator(X=all_features.iloc[:working_size],
                                               y=labels[:working_size],
                                               split_type='random', random_state=42,
                                               n_splits=1, test_size=0.1)
# get numeric indexes
train_val, test = next(trainval_test_split_gen)
# get real indexes (i.e. Chembl id, substance name)
train_val, test = all_features.iloc[train_val].index, all_features.iloc[test].index

In [221]:
%%time

for f in feature_sets:
    print(f'{f} features...')

    # create dataframes of only the features and observarions we are looking at now without na-s
    #
    # train_val data frame (90% of the data), train_val left from before
    features = all_features.loc[train_val, feature_sets[f]].dropna(axis=0)
    # train_val targets
    targets = labels.loc[features.index]


    # we check for train-test scores because they are computed last for each estimator
    if f not in train_test_scores:
        val_scores[f] = {}
        train_test_scores[f] = {}

    # -----------------------------------------------------------------
    print(f'\t\t Computing validation scores')

    # make generator for 3 train/val splits
    train_val_split_gen = make_split_generator(X=features, y=targets,
                                               split_type='random', random_state=42,
                                               n_splits=3, test_size=1/9)
    # leave the same for validation and test scores
    results = {
        'test_MAE':  [], 'test_RMSE':  [], 'test_R^2':  [], 'test_pearson_r':  [],
        'train_MAE': [], 'train_RMSE': [], 'train_R^2': [], 'train_pearson_r': []
    }


    # iterate over the train_val_split_gen to get 3 splits and metrics for them
    for train, val in train_val_split_gen:
        # get real indexes (i.e. Chembl id, substance name)
        train, val = all_features.iloc[train].index, all_features.iloc[val].index

        # train-val split
        X_train = all_features.loc[train, feature_sets[f]].dropna(axis=0)
        y_train = labels.loc[X_train.index]
        X_test = all_features.loc[val, feature_sets[f]].dropna(axis=0)
        y_test = labels.loc[X_test.index]

        # leave the same for validation and test scores
        fit_tanimoto_gpr(X_train, y_train, X_test, y_test, results)

    
    # record metrics after multiple iteratios on validation set
    val_scores[f] = results
    #%store val_scores

    # save validation scores
    with open(f'../results/{dataset}_{smile_type}_{grid_search_type}_gp_tanimoto_val_scores.pickle', 'wb') as fp:
        pickle.dump(val_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)


    # -----------------------------------------------------------------
    print(f'\t\t Computing train and test scores')

    # create dataframes of only the features and observarions we are looking at now without na-s
    all_interesting_features = all_features[feature_sets[f]].iloc[:working_size].dropna(axis=0)
    all_interesting_targets = labels.loc[all_interesting_features.index]

    # make generator for 3 trainval/test splits
    trainval_test_split_gen = make_split_generator(X=all_interesting_features,
                                                   y=all_interesting_targets,
                                                   split_type='random', random_state=42,
                                                   n_splits=3, test_size=0.1)
    # leave the same for validation and test scores
    results = {
        'test_MAE':  [], 'test_RMSE':  [], 'test_R^2':  [], 'test_pearson_r':  [],
        'train_MAE': [], 'train_RMSE': [], 'train_R^2': [], 'train_pearson_r': []
    }


    # iterate over the train_val_split_gen to get 3 splits and metrics for them
    for train_val, test in trainval_test_split_gen:
        # get real indexes (i.e. Chembl id, substance name)
        train_val, test = all_features.iloc[train_val].index, all_features.iloc[test].index

        # trainval-test split
        X_train = all_features.loc[train_val, feature_sets[f]].dropna(axis=0)
        y_train = labels.loc[X_train.index]
        X_test = all_features.loc[test, feature_sets[f]].dropna(axis=0)
        y_test = labels.loc[X_test.index]

        # leave the same for validation and test scores
        fit_tanimoto_gpr(X_train, y_train, X_test, y_test, results)

    train_test_scores[f] = results

    # save train test scores
    with open(f'../results/{dataset}_{smile_type}_{grid_search_type}_gp_tanimoto_train_test_scores.pickle', 'wb') as fp:
        pickle.dump(train_test_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)

1024ecfp-4 features...
		 Computing validation scores
╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤════════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │      value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪════════════╡
│ GPR.mean_function.c     │ Parameter │ Identity         │         │ True        │ ()      │ float64 │ -0.355989  │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼────────────┤
│ GPR.kernel.variance     │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │  1.25372   │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼────────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ True        │ ()      │ float64 │  0.0148233 │
╘═════════════════

In [222]:
# sanity check
train_test_scores

{'1024ecfp-4': {'test_MAE': [0.5025881193623053,
   0.490940972889915,
   0.5971520957921008],
  'test_RMSE': [0.700546147133206, 0.6668768885448391, 0.8054344632115541],
  'test_R^2': [0.649072015433638, 0.6376330229471974, 0.5719536585529877],
  'test_pearson_r': [0.8070281867192061,
   0.7990346004461988,
   0.7588251054550114],
  'train_MAE': [0.040402005224676935,
   0.05071944624017948,
   0.04178398693586923],
  'train_RMSE': [0.05815619314374728,
   0.07267750088482147,
   0.06121479449397663],
  'train_R^2': [0.9976695341001454, 0.9964046552677486, 0.9973950592492321],
  'train_pearson_r': [0.9990799107244046,
   0.9985930699005947,
   0.9989605544321083]},
 '1024ecfp-6': {'test_MAE': [0.5460286986971107,
   0.5227285587834318,
   0.6369857003882723],
  'test_RMSE': [0.7316091114806973, 0.6912379303870037, 0.8508937817940948],
  'test_R^2': [0.6172610020322158, 0.610674901504737, 0.5222715863096794],
  'test_pearson_r': [0.7864909580182584,
   0.7833932643025207,
   0.72468876

In [223]:
val_scores

{'1024ecfp-4': {'test_MAE': [0.5668781024700458,
   0.5344541916953804,
   0.5572877708198974],
  'test_RMSE': [0.7509776358368999, 0.713528658748328, 0.7471911088109986],
  'test_R^2': [0.6077137649200166, 0.6563374528393302, 0.5893583650725728],
  'test_pearson_r': [0.7804025475978751, 0.810818130660305, 0.767945856232835],
  'train_MAE': [0.025562144430317835,
   0.04010384602761006,
   0.04305480296343022],
  'train_RMSE': [0.037992398762384284,
   0.05820683110808082,
   0.062122383443579114],
  'train_R^2': [0.9990020770421465, 0.9976502852529789, 0.9973494656264063],
  'train_pearson_r': [0.9995993476941151,
   0.9990911345277582,
   0.9989737909787453]},
 '1024ecfp-6': {'test_MAE': [0.6088831016026847,
   0.5592072837081095,
   0.6074522920581641],
  'test_RMSE': [0.7890085632075547, 0.7327368363046339, 0.8198129848484439],
  'test_R^2': [0.566975473912168, 0.6375856275128645, 0.5056561050414015],
  'test_pearson_r': [0.7540276621982204,
   0.799663199421984,
   0.7133965409654

# CHOOSE: Names for saving results

## From Pickle files

In [224]:
with open(f'../results/{dataset}_{smile_type}_{grid_search_type}_gp_tanimoto_val_scores.pickle', 'rb') as fp:
    val_scores = pickle.load(fp)

with open(f'../results/{dataset}_{smile_type}_{grid_search_type}_gp_tanimoto_train_test_scores.pickle', 'rb') as fp:
    train_test_scores = pickle.load(fp)

In [225]:
val_scores

{'1024ecfp-4': {'test_MAE': [0.5668781024700458,
   0.5344541916953804,
   0.5572877708198974],
  'test_RMSE': [0.7509776358368999, 0.713528658748328, 0.7471911088109986],
  'test_R^2': [0.6077137649200166, 0.6563374528393302, 0.5893583650725728],
  'test_pearson_r': [0.7804025475978751, 0.810818130660305, 0.767945856232835],
  'train_MAE': [0.025562144430317835,
   0.04010384602761006,
   0.04305480296343022],
  'train_RMSE': [0.037992398762384284,
   0.05820683110808082,
   0.062122383443579114],
  'train_R^2': [0.9990020770421465, 0.9976502852529789, 0.9973494656264063],
  'train_pearson_r': [0.9995993476941151,
   0.9990911345277582,
   0.9989737909787453]},
 '1024ecfp-6': {'test_MAE': [0.6088831016026847,
   0.5592072837081095,
   0.6074522920581641],
  'test_RMSE': [0.7890085632075547, 0.7327368363046339, 0.8198129848484439],
  'test_R^2': [0.566975473912168, 0.6375856275128645, 0.5056561050414015],
  'test_pearson_r': [0.7540276621982204,
   0.799663199421984,
   0.7133965409654

In [226]:
train_test_scores

{'1024ecfp-4': {'test_MAE': [0.5025881193623053,
   0.490940972889915,
   0.5971520957921008],
  'test_RMSE': [0.700546147133206, 0.6668768885448391, 0.8054344632115541],
  'test_R^2': [0.649072015433638, 0.6376330229471974, 0.5719536585529877],
  'test_pearson_r': [0.8070281867192061,
   0.7990346004461988,
   0.7588251054550114],
  'train_MAE': [0.040402005224676935,
   0.05071944624017948,
   0.04178398693586923],
  'train_RMSE': [0.05815619314374728,
   0.07267750088482147,
   0.06121479449397663],
  'train_R^2': [0.9976695341001454, 0.9964046552677486, 0.9973950592492321],
  'train_pearson_r': [0.9990799107244046,
   0.9985930699005947,
   0.9989605544321083]},
 '1024ecfp-6': {'test_MAE': [0.5460286986971107,
   0.5227285587834318,
   0.6369857003882723],
  'test_RMSE': [0.7316091114806973, 0.6912379303870037, 0.8508937817940948],
  'test_R^2': [0.6172610020322158, 0.610674901504737, 0.5222715863096794],
  'test_pearson_r': [0.7864909580182584,
   0.7833932643025207,
   0.72468876