# Lipo_README

Lipophilicity is a dataset curated from the ChEMBL database, containing experimental results for the octanol/water distribution coefficient (**logD** at pH=7.4). Because lipophilicity strongly influences membrane permeability and solubility, predicting it is of high importance to drug development.

The data file contains a CSV table, of which the columns below are used:
- "smiles" - SMILES representation of the molecular structure
- "exp" - Measured octanol/water distribution coefficient (logD) of the compound, used as label

**Reference:**
Hersey, A. ChEMBL Deposited Data Set - AZ dataset; 2015. https://doi.org/10.6019/chembl3301361

In [217]:
# List the files in the current working directory
%ls

Lipo_README                           get_fingerprints_local.ipynb
Lipophilicity.csv                     get_protenated_from_canonical.ipynb
cdk2smi1.smi                          logD.csv
ecfp4_features.csv                    protonated_smiles.smi
ecfp6_features.csv                    rf_grid_search_on_fingerprints.ipynb
example.smi                           smiles.smi
get_fingerprints.ipynb


# Import modules

In [218]:
import warnings
warnings.filterwarnings('ignore')

import pathlib

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

# Load Data

In [219]:
# Load the raw table; the first CSV column (CMPD_CHEMBLID) becomes the row index
data = pd.read_csv('Lipophilicity.csv', index_col=0)
preview = data.head()
print(preview, '\n')
data.shape

                exp                                             smiles
CMPD_CHEMBLID                                                         
CHEMBL596271   3.54            Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080 -1.18  COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
CHEMBL1771     3.69             COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951   3.37  OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
CHEMBL565079   3.10  Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N... 



(4200, 2)

In [205]:
# Regression label: the measured logD values held in the "exp" column
target = data.loc[:, 'exp']
target.head(5)

CMPD_CHEMBLID
CHEMBL596271     3.54
CHEMBL1951080   -1.18
CHEMBL1771       3.69
CHEMBL234951     3.37
CHEMBL565079     3.10
Name: exp, dtype: float64

In [206]:
# Save the target (logD) series, indexed by compound ID, to logD.csv
# in the current working directory.
target.to_csv('logD.csv')

In [207]:
# List the working directory to confirm logD.csv is present
%ls

Lipo_README                           get_fingerprints.ipynb
Lipophilicity.csv                     get_fingerprints_local.ipynb
cdk2smi1.smi                          logD.csv
ecfp4_features.csv                    protonated_smiles.smi
ecfp6_features.csv                    rf_grid_search_on_fingerprints.ipynb
example.smi                           smiles.smi


In [208]:
# Pull out the SMILES column (still indexed by compound ID) and report its length
smiles = data.loc[:, 'smiles']
print(smiles.shape[0])

smiles.head(5)

4200


CMPD_CHEMBLID
CHEMBL596271               Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080    COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
CHEMBL1771                  COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951     OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
CHEMBL565079     Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
Name: smiles, dtype: object

In [209]:
# Plain Python list of the SMILES strings, in the same order as `data`
smiles_list = smiles.tolist()

# Create molecules from smiles

In [211]:
%%time

# record molecules
ms = [Chem.MolFromSmiles(smile) for smile in smiles]

CPU times: user 2.15 s, sys: 100 ms, total: 2.25 s
Wall time: 2.29 s


# Get ECFP-4 and ECFP-6 (Morgan fingerprints, radius 2 and 3, nBits=2048)

## ECFP-4

In [None]:
# get ecfp4
%time ecfp4 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=2048) for m in ms]

CPU times: user 195 ms, sys: 0 ns, total: 195 ms
Wall time: 198 ms


In [None]:
# Sanity check: (number of fingerprints, number of bits in the first fingerprint)
len(ecfp4), len(ecfp4[0])

(4200, 2048)

In [None]:
# record them to np.array
%time ecfp4_np = np.array(ecfp4)

CPU times: user 17.1 s, sys: 915 ms, total: 18 s
Wall time: 18 s


In [None]:
# Expect one row per molecule and one column per fingerprint bit
ecfp4_np.shape

(4200, 2048)

In [None]:
# record pandas DataFrame
# Rows and columns get pandas' default integer labels here; the column labels
# therefore correspond to fingerprint bit positions.
%time ecfp4_pd = pd.DataFrame(ecfp4_np)

CPU times: user 850 µs, sys: 0 ns, total: 850 µs
Wall time: 775 µs


In [None]:
# Preview the first rows (still labelled by integer position)
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# First 20 compound IDs from the source table's index
data.index[:20].tolist()

['CHEMBL596271',
 'CHEMBL1951080',
 'CHEMBL1771',
 'CHEMBL234951',
 'CHEMBL565079',
 'CHEMBL317462',
 'CHEMBL1951182',
 'CHEMBL2030960',
 'CHEMBL13097',
 'CHEMBL1527751',
 'CHEMBL1940306',
 'CHEMBL578201',
 'CHEMBL276218',
 'CHEMBL2153181',
 'CHEMBL1916276',
 'CHEMBL93884',
 'CHEMBL2171986',
 'CHEMBL190044',
 'CHEMBL469790',
 'CHEMBL2165055']

In [None]:
# Label the fingerprint rows with the compound IDs. The fingerprints were built
# in the row order of `data`, so positions line up. A plain list (not the Index
# object) is assigned, so the index name is not carried over.
ecfp4_pd.index = data.index.tolist()

In [None]:
# Preview again: rows are now labelled by compound ID
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## ECFP-6

In [None]:
# get ecfp6
%time ecfp6 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=3, nBits=2048) for m in ms]

CPU times: user 301 ms, sys: 1.81 ms, total: 303 ms
Wall time: 305 ms


In [None]:
# Sanity check: (number of fingerprints, number of bits in the first fingerprint)
len(ecfp6), len(ecfp6[0])

(4200, 2048)

In [None]:
# record them to np.array
%time ecfp6_np = np.array(ecfp6)

CPU times: user 17.2 s, sys: 880 ms, total: 18.1 s
Wall time: 18 s


In [None]:
# Expect one row per molecule and one column per fingerprint bit
ecfp6_np.shape

(4200, 2048)

In [None]:
# record pandas DataFrame
# Rows and columns get pandas' default integer labels here; the column labels
# therefore correspond to fingerprint bit positions.
%time ecfp6_pd = pd.DataFrame(ecfp6_np)

CPU times: user 589 µs, sys: 35 µs, total: 624 µs
Wall time: 905 µs


In [None]:
# Preview the first rows (still labelled by integer position)
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# First 20 compound IDs from the source table's index
data.index[:20].tolist()

['CHEMBL596271',
 'CHEMBL1951080',
 'CHEMBL1771',
 'CHEMBL234951',
 'CHEMBL565079',
 'CHEMBL317462',
 'CHEMBL1951182',
 'CHEMBL2030960',
 'CHEMBL13097',
 'CHEMBL1527751',
 'CHEMBL1940306',
 'CHEMBL578201',
 'CHEMBL276218',
 'CHEMBL2153181',
 'CHEMBL1916276',
 'CHEMBL93884',
 'CHEMBL2171986',
 'CHEMBL190044',
 'CHEMBL469790',
 'CHEMBL2165055']

In [None]:
# Label the fingerprint rows with the compound IDs. The fingerprints were built
# in the row order of `data`, so positions line up. A plain list (not the Index
# object) is assigned, so the index name is not carried over.
ecfp6_pd.index = data.index.tolist()

In [None]:
# Preview again: rows are now labelled by compound ID
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
CHEMBL596271,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL1951080,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
CHEMBL1771,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
CHEMBL234951,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
CHEMBL565079,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Saving both fingerprint tables to .csv files

In [None]:
# Check the working directory contents before writing the feature files
%ls

get_fingerprints.ipynb  Lipophilicity.csv  Lipo_README


In [None]:
%%time

ecfp4_pd.to_csv('ecfp4_features.csv')
ecfp6_pd.to_csv('ecfp6_features.csv')

CPU times: user 2.1 s, sys: 39.7 ms, total: 2.14 s
Wall time: 2.28 s


In [None]:
# List the working directory to confirm the feature CSV files were written
%ls

ecfp4_features      get_fingerprints.ipynb
ecfp4_features.csv  Lipophilicity.csv
ecfp6_features      Lipo_README
ecfp6_features.csv  rf_grid_search_on_fingerprints.ipynb
