# Setup (local)

In [1]:
# List the contents of the current working directory (the notebooks folder).
%ls

colab_RF_CIs_on_fingerprints_initial.ipynb
colab_extended_grid_search_pipeline.ipynb
colab_reproducing_grid_search_pipeline.ipynb
get_protenated_from_canonical.ipynb
necessary_eda.ipynb
visualise_extended_grid_search_results.ipynb
visualise_grid_search_results.ipynb
visualise_reproducing_grid_search_results.ipynb


In [2]:
# List the data files available in ../data/ (the tables loaded further down come from here).
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv


## Import modules

In [19]:
import warnings
warnings.filterwarnings('ignore')

# saving models
import json
import pickle

# directory
import os
import sys

# standard modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# metrics
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import jaccard_score # Tanimoto

# making custom metrics
from sklearn.metrics import make_scorer

# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

# preprocessing
from sklearn.feature_selection import VarianceThreshold # to remove zero-var features
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.kernel_ridge import KernelRidge

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct

# pipelines
# https://scikit-learn.org/stable/modules/compose.html#combining-estimators
from sklearn.pipeline import make_pipeline, Pipeline

In [4]:
# Record the scikit-learn version this notebook was run with.
import sklearn
print(sklearn.__version__)

0.23.1


## Set plotting style

In [5]:
%matplotlib inline
plt.style.use('fivethirtyeight')

plt.rcParams['axes.facecolor']='w'
#plt.rcParams['axes.linewidth']=1
plt.rcParams['axes.edgecolor']='w'
plt.rcParams['figure.facecolor']='w'
plt.rcParams['savefig.facecolor']='w'
#plt.rcParams['grid.color']='white'

# Load Data (same everywhere)

## CHOOSE: dataset, smile_type

In [6]:
# --- Editable settings ---------------------------------------------------
dataset = 'lipophilicity'
smile_type = 'original'

# --- Validation (read-only, do not edit) ---------------------------------
# Supported datasets, listed from smallest to largest.
assert dataset in ('freesolv', 'esol', 'lipophilicity')
# Supported SMILES variants.
assert smile_type in ('original', 'protonated')

## Load Features and Targets

Keep all feature sets loaded here so that the Setup and Load Data sections stay identical in every notebook.

In [7]:
# Paths of every table to load, in the same order as the variables unpacked below.
# Each file is read with its first column as the index.
_table_paths = [
    # original data
    f'../data/{dataset}_{smile_type}_IdSmilesLabels.csv',
    # features
    f'../data/{dataset}_{smile_type}_rdkit_features.csv',
    f'../data/{dataset}_{smile_type}_1024ecfp4_features.csv',
    f'../data/{dataset}_{smile_type}_1024ecfp6_features.csv',
    f'../data/{dataset}_{smile_type}_2048ecfp4_features.csv',
    f'../data/{dataset}_{smile_type}_2048ecfp6_features.csv',
]

(
    id_smile_target,
    rdkit_features,
    ecfp4_1024_features,
    ecfp6_1024_features,
    ecfp4_2048_features,
    ecfp6_2048_features,
) = [pd.read_csv(path, index_col=0) for path in _table_paths]

# The prediction target is the 'labels' column of the original data.
labels = id_smile_target['labels']

In [8]:
# Report the shape of every loaded table; captions are padded to a common width
# so that the shapes line up in one column.
for caption, table in [
    ('rdkit_features.shape:      ', rdkit_features),
    ('ecfp4_1024_features.shape: ', ecfp4_1024_features),
    ('ecfp6_1024_features.shape: ', ecfp6_1024_features),
    ('ecfp4_2048_features.shape: ', ecfp4_2048_features),
    ('ecfp6_2048_features.shape: ', ecfp6_2048_features),
    ('labels.shape:              ', labels),
]:
    print(caption, table.shape)

rdkit_features.shape:       (4200, 200)
ecfp4_1024_features.shape:  (4200, 1024)
ecfp6_1024_features.shape:  (4200, 1024)
ecfp4_2048_features.shape:  (4200, 2048)
ecfp6_2048_features.shape:  (4200, 2048)
labels.shape:               (4200,)


In [9]:
# Preview the first few target values (indexed by the `id` column of the original data).
labels.head()

id
CHEMBL596271     3.54
CHEMBL1951080   -1.18
CHEMBL1771       3.69
CHEMBL234951     3.37
CHEMBL565079     3.10
Name: labels, dtype: float64

## Create one DataFrame with all features

In [10]:
# Place the five feature tables side by side; rows are matched on the shared index.
_feature_tables = [
    rdkit_features,
    ecfp4_1024_features,
    ecfp6_1024_features,
    ecfp4_2048_features,
    ecfp6_2048_features,
]
all_features = pd.concat(_feature_tables, axis='columns')

In [11]:
# Sanity check: the column count should be 200 + 2 * 1024 + 2 * 2048 = 6344.
all_features.shape

(4200, 6344)

In [12]:
# Preview the first rows of the combined feature table.
all_features.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,2048ecfp6-2038,2048ecfp6-2039,2048ecfp6-2040,2048ecfp6-2041,2048ecfp6-2042,2048ecfp6-2043,2048ecfp6-2044,2048ecfp6-2045,2048ecfp6-2046,2048ecfp6-2047
CHEMBL596271,8.838871,-4.082382,8.838871,0.008322,0.728444,340.858,319.69,340.145474,124.0,0.0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,15.117958,-6.843264,15.117958,0.920611,0.545587,494.591,468.383,494.118143,178.0,0.0,...,0,0,0,0,0,0,0,1,0,0
CHEMBL1771,13.321227,-3.906276,13.321227,0.250582,0.807761,321.829,305.701,321.059027,110.0,0.0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,14.213604,-4.272077,14.213604,0.1874,0.50665,419.89,401.746,419.070655,146.0,0.0,...,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,14.167882,-4.810274,14.167882,0.671279,0.747686,381.48,354.264,381.216475,148.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Create feature_sets dictionary mapping each feature-set name to a pd.Index of its feature names

In [13]:
# Map each feature-set name to the column labels (pd.Index) of its table,
# so that the set can later be selected from `all_features` by name.
feature_sets = {
    set_name: table.columns
    for set_name, table in [
        ('RDKit', rdkit_features),
        ('1024ecfp-4', ecfp4_1024_features),
        ('1024ecfp-6', ecfp6_1024_features),
        ('2048ecfp-4', ecfp4_2048_features),
        ('2048ecfp-6', ecfp6_2048_features),
    ]
}

# Get number of zero-variance columns in unsplit (train-val-test) datasets

In [14]:
# threshold=0.0 is the library default, spelled out here: only zero-variance columns are dropped.
transform = VarianceThreshold(threshold=0.0)

In [15]:
# For every feature set, count its columns before and after dropping
# zero-variance columns, print the counts and collect them for the summary table.
featuresets_to_reductions = {}

for set_name, column_names in feature_sets.items():
    subset = all_features[column_names]
    n_original = subset.shape[1]
    print(f'{set_name} feature set originally has {n_original} columns')

    # fit_transform keeps only the columns that pass the variance threshold.
    n_reduced = transform.fit_transform(X=subset).shape[1]
    print(f'{set_name} feature set without zero-var columns has {n_reduced} columns')

    n_removed = n_original - n_reduced
    print(f'Reduction: {n_removed}\n')

    featuresets_to_reductions[set_name] = {
        'Original features': n_original,
        'Reduced features': n_reduced,
        'Reduction': n_removed,
    }

# Transpose so that each feature set becomes a row and each count a column.
featuresets_to_reductions_df = pd.DataFrame(featuresets_to_reductions).T
display(featuresets_to_reductions_df)

RDKit feature set originally has 200 columns
RDKit feature set without zero-var columns has 188 columns
Reduction: 12

1024ecfp-4 feature set originally has 1024 columns
1024ecfp-4 feature set without zero-var columns has 1024 columns
Reduction: 0

1024ecfp-6 feature set originally has 1024 columns
1024ecfp-6 feature set without zero-var columns has 1024 columns
Reduction: 0

2048ecfp-4 feature set originally has 2048 columns
2048ecfp-4 feature set without zero-var columns has 2047 columns
Reduction: 1

2048ecfp-6 feature set originally has 2048 columns
2048ecfp-6 feature set without zero-var columns has 2048 columns
Reduction: 0



Unnamed: 0,Original features,Reduced features,Reduction
RDKit,200,188,12
1024ecfp-4,1024,1024,0
1024ecfp-6,1024,1024,0
2048ecfp-4,2048,2047,1
2048ecfp-6,2048,2048,0


In [16]:
# Persist the reduction counts of the currently selected dataset so they can be
# compared across datasets later.
# NOTE: 'varaince' (sic) is kept in the file name because the cells that read the
# table back use that exact spelling.
os.makedirs('../tables', exist_ok=True)  # to_csv does not create a missing output directory
featuresets_to_reductions_df.to_csv(f'../tables/{dataset}_{smile_type}_featuresets_to_zero_varaince_reductions.csv')

In [27]:
# Read the saved table back to confirm that it round-trips through CSV.
_saved_reductions_csv = f'../tables/{dataset}_{smile_type}_featuresets_to_zero_varaince_reductions.csv'
pd.read_csv(_saved_reductions_csv, index_col=0)

Unnamed: 0,Original features,Reduced features,Reduction
RDKit,200,188,12
1024ecfp-4,1024,1024,0
1024ecfp-6,1024,1024,0
2048ecfp-4,2048,2047,1
2048ecfp-6,2048,2048,0


## Visualizing reductions

In [32]:
# Load the saved reduction tables of all three datasets for a side-by-side comparison.
# Each table is written by the export cell above for the dataset selected at the time,
# so all three exist only after the notebook has been run once per dataset.
# NOTE: 'varaince' (sic) matches the file name used by the export cell.
_reduction_csvs = {
    name: f'../tables/{name}_original_featuresets_to_zero_varaince_reductions.csv'
    for name in ['freesolv', 'esol', 'lipophilicity']  # does not shadow the `dataset` setting
}
_missing_csvs = [path for path in _reduction_csvs.values() if not os.path.isfile(path)]

if not _missing_csvs:
    esol = pd.read_csv(_reduction_csvs['esol'], index_col=0)
    freesolv = pd.read_csv(_reduction_csvs['freesolv'], index_col=0)
    lipophilicity = pd.read_csv(_reduction_csvs['lipophilicity'], index_col=0)
else:
    # Say why the comparison tables are not loaded, instead of skipping silently and
    # leaving the following display cells to fail with an unexplained NameError.
    print('Comparison tables not loaded; missing files:', *_missing_csvs, sep='\n  ')

In [38]:
# 642 rows
display(freesolv)

Unnamed: 0,Original features,Reduced features,Reduction
RDKit,200,167,33
1024ecfp-4,1024,849,175
1024ecfp-6,1024,981,43
2048ecfp-4,2048,1224,824
2048ecfp-6,2048,1617,431


In [39]:
# 1128 rows
display(esol)

Unnamed: 0,Original features,Reduced features,Reduction
RDKit,200,180,20
1024ecfp-4,1024,1013,11
1024ecfp-6,1024,1024,0
2048ecfp-4,2048,1856,192
2048ecfp-6,2048,2030,18


In [40]:
# 4200 rows
display(lipophilicity)

Unnamed: 0,Original features,Reduced features,Reduction
RDKit,200,188,12
1024ecfp-4,1024,1024,0
1024ecfp-6,1024,1024,0
2048ecfp-4,2048,2047,1
2048ecfp-6,2048,2048,0
