In [1]:
# List the files in the current working directory (the notebooks folder)
%ls

colab_rf_grid_search_on_fingerprints.ipynb
get_all_fingerprints_from_all_smiles.ipynb
get_fingerprints_lipophilicity_original.ipynb
get_original_id_smile_target_esol.ipynb
get_original_id_smile_target_freeesolv.ipynb
get_original_id_smile_target_lipophilicity.ipynb
get_protenated_from_canonical.ipynb
get_rdkit_descriptors_freesolv_original.ipynb
modelling_pipeline.ipynb


In [2]:
# List the contents of the sibling data directory
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
freesolv_original_ecfp4_features.csv
freesolv_original_ecfp6_features.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
lipophilicity_original_ecfp4_features.csv
lipophilicity_original_ecfp6_features.csv
lipophilicity_original_rdkit_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi
lipophilicity_protonated_smiles20_messy.txt


In [55]:
# Run configuration: which dataset to featurise and which SMILES variant to use.
# Both values are interpolated into the input and output file names below.
dataset, smile_type = 'lipophilicity', 'original'

In [56]:
# READ_ONLY
# Guard against typos in the configuration: only these dataset names and
# SMILES variants are accepted.
assert dataset in ('esol', 'freesolv', 'lipophilicity')
assert smile_type in ('original', 'protonated')

# Import modules

In [3]:
import warnings
warnings.filterwarnings('ignore')

import pathlib

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

# Load Data

In [57]:
# Read the id / SMILES / target table for the configured dataset; the first
# CSV column becomes the index.
source_csv = f'../data/{dataset}_{smile_type}_IdSmileTarget.csv'
data = pd.read_csv(source_csv, index_col=0)

print(data.head(), '\n')
data.shape

                                                           smile  target
id                                                                      
CHEMBL596271             Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14    3.54
CHEMBL1951080  COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...   -1.18
CHEMBL1771                COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl    3.69
CHEMBL234951   OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...    3.37
CHEMBL565079   Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...    3.10 



(4200, 2)

In [58]:
# Keep only the SMILES column; it stays indexed by molecule id.
smiles = data['smile']
print(smiles.shape[0])

smiles.head()

4200


id
CHEMBL596271               Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080    COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
CHEMBL1771                  COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951     OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
CHEMBL565079     Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
Name: smile, dtype: object

# Create molecules from smiles

In [59]:
%%time

# record molecules
ms = [Chem.MolFromSmiles(smile) for smile in smiles]

CPU times: user 2.01 s, sys: 59.1 ms, total: 2.07 s
Wall time: 2.1 s


# Get ECFP-4 and ECFP-6 (Morgan fingerprints with radius 2 and 3, nBits=2048)

## ECFP-4

In [60]:
# get ecfp4
%time ecfp4 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=2048) for m in ms]

CPU times: user 529 ms, sys: 4.72 ms, total: 534 ms
Wall time: 536 ms


In [61]:
# Sanity check: number of fingerprints and number of bits in the first one
len(ecfp4), len(ecfp4[0])

(4200, 2048)

In [62]:
# record them to np.array
%time ecfp4_np = np.array(ecfp4)

CPU times: user 15.2 s, sys: 198 ms, total: 15.4 s
Wall time: 15.5 s


In [63]:
# Expect (number of molecules, number of bits)
ecfp4_np.shape

(4200, 2048)

In [64]:
# record pandas DataFrame
%time ecfp4_pd = pd.DataFrame(ecfp4_np)

CPU times: user 875 µs, sys: 497 µs, total: 1.37 ms
Wall time: 4.14 ms


In [65]:
# Preview the first rows of the ECFP-4 frame
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
# First 20 molecule ids, in the order they appear in the input table
data.index[:20].tolist()

['CHEMBL596271',
 'CHEMBL1951080',
 'CHEMBL1771',
 'CHEMBL234951',
 'CHEMBL565079',
 'CHEMBL317462',
 'CHEMBL1951182',
 'CHEMBL2030960',
 'CHEMBL13097',
 'CHEMBL1527751',
 'CHEMBL1940306',
 'CHEMBL578201',
 'CHEMBL276218',
 'CHEMBL2153181',
 'CHEMBL1916276',
 'CHEMBL93884',
 'CHEMBL2171986',
 'CHEMBL190044',
 'CHEMBL469790',
 'CHEMBL2165055']

In [67]:
# Label the fingerprint rows with the molecule ids from the input table
ecfp4_pd.index = data.index.tolist()

In [68]:
# Confirm the rows are now labelled with molecule ids
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Name each bit column 'ecfp4-<bit position>'
ecfp4_pd.columns = [f'ecfp4-{i}' for i in range(ecfp4_pd.shape[1])]

In [70]:
# Confirm the columns now carry the 'ecfp4-' prefix
ecfp4_pd.head()

Unnamed: 0,ecfp4-0,ecfp4-1,ecfp4-2,ecfp4-3,ecfp4-4,ecfp4-5,ecfp4-6,ecfp4-7,ecfp4-8,ecfp4-9,...,ecfp4-2038,ecfp4-2039,ecfp4-2040,ecfp4-2041,ecfp4-2042,ecfp4-2043,ecfp4-2044,ecfp4-2045,ecfp4-2046,ecfp4-2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## ECFP-6

In [71]:
# get ecfp6
%time ecfp6 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=3, nBits=2048) for m in ms]

CPU times: user 887 ms, sys: 13.7 ms, total: 901 ms
Wall time: 1.18 s


In [72]:
# Sanity check: number of fingerprints and number of bits in the first one
len(ecfp6), len(ecfp6[0])

(4200, 2048)

In [73]:
# record them to np.array
%time ecfp6_np = np.array(ecfp6)

CPU times: user 15.2 s, sys: 217 ms, total: 15.4 s
Wall time: 15.6 s


In [74]:
# Expect (number of molecules, number of bits)
ecfp6_np.shape

(4200, 2048)

In [75]:
# record pandas DataFrame
%time ecfp6_pd = pd.DataFrame(ecfp6_np)

CPU times: user 1.49 ms, sys: 1.52 ms, total: 3.01 ms
Wall time: 2.51 ms


In [76]:
# Preview the first rows of the ECFP-6 frame
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# First 20 molecule ids, in the order they appear in the input table
data.index[:20].tolist()

['CHEMBL596271',
 'CHEMBL1951080',
 'CHEMBL1771',
 'CHEMBL234951',
 'CHEMBL565079',
 'CHEMBL317462',
 'CHEMBL1951182',
 'CHEMBL2030960',
 'CHEMBL13097',
 'CHEMBL1527751',
 'CHEMBL1940306',
 'CHEMBL578201',
 'CHEMBL276218',
 'CHEMBL2153181',
 'CHEMBL1916276',
 'CHEMBL93884',
 'CHEMBL2171986',
 'CHEMBL190044',
 'CHEMBL469790',
 'CHEMBL2165055']

In [78]:
# Label the fingerprint rows with the molecule ids from the input table
ecfp6_pd.index = data.index.tolist()

In [79]:
# Confirm the rows are now labelled with molecule ids
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# Name each bit column 'ecfp6-<bit position>'
ecfp6_pd.columns = [f'ecfp6-{i}' for i in range(ecfp6_pd.shape[1])]

In [81]:
# Confirm the columns now carry the 'ecfp6-' prefix
ecfp6_pd.head()

Unnamed: 0,ecfp6-0,ecfp6-1,ecfp6-2,ecfp6-3,ecfp6-4,ecfp6-5,ecfp6-6,ecfp6-7,ecfp6-8,ecfp6-9,...,ecfp6-2038,ecfp6-2039,ecfp6-2040,ecfp6-2041,ecfp6-2042,ecfp6-2043,ecfp6-2044,ecfp6-2045,ecfp6-2046,ecfp6-2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Saving both fingerprint sets to .csv files

In [82]:
# Check the contents of the data directory before writing the feature files
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
freesolv_original_ecfp4_features.csv
freesolv_original_ecfp6_features.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
lipophilicity_original_ecfp4_features.csv
lipophilicity_original_ecfp6_features.csv
lipophilicity_original_rdkit_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi
lipophilicity_protonated_smiles20_messy.txt


In [83]:
%%time

ecfp4_pd.to_csv(f'../data/{dataset}_{smile_type}_ecfp4_features.csv')
ecfp6_pd.to_csv(f'../data/{dataset}_{smile_type}_ecfp6_features.csv')

CPU times: user 3.45 s, sys: 201 ms, total: 3.65 s
Wall time: 3.86 s


In [84]:
# List the current working directory again after saving
%ls

colab_rf_grid_search_on_fingerprints.ipynb
get_all_fingerprints_from_all_smiles.ipynb
get_fingerprints.ipynb
get_original_id_smile_target_esol.ipynb
get_original_id_smile_target_freeesolv.ipynb
get_original_id_smile_target_lipophilicity.ipynb
get_protenated_from_canonical.ipynb
get_rdkit_descriptors_freesolv_original.ipynb
modelling_pipeline.ipynb


In [85]:
# List the data directory after writing the feature files
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
freesolv_original_ecfp4_features.csv
freesolv_original_ecfp6_features.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
lipophilicity_original_ecfp4_features.csv
lipophilicity_original_ecfp6_features.csv
lipophilicity_original_rdkit_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi
lipophilicity_protonated_smiles20_messy.txt
