In [33]:
%ls

esol_get_original_id_smile_target.ipynb
freeesolv_get_original_id_smile_target.ipynb
get_all_fingerprints_for_all_datasets.ipynb
get_fingerprints.ipynb
get_fingerprints_local.ipynb
get_protenated_from_canonical.ipynb
lipophilicity_get_original_id_smile_target.ipynb
rf_grid_search_on_fingerprints.ipynb


In [34]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv


# Import modules

In [4]:
import warnings
warnings.filterwarnings('ignore')

import pathlib

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem

In [6]:
for dataset in ['esol', 'freesolv', 'lipophilicity']:
    for smile_type in ['original']:
        pass
        df = pd.read_csv()

TypeError: parser_f() missing 1 required positional argument: 'filepath_or_buffer'

In [7]:
print(dataset, smile_type)

esol original


# Load Data

In [8]:
data = pd.read_csv(f'../data/{dataset}_{smile_type}_IdSmileTarget.csv', index_col=0)
print(data.head(), '\n')
data.shape

                                                       smile  target
id                                                                  
Amigdalin  OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...   -0.77
Fenfuram                              Cc1occc1C(=O)Nc2ccccc2   -3.30
citral                                  CC(C)=CCCC(C)=CC(=O)   -2.06
Picene                    c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43   -7.87
Thiophene                                            c1ccsc1   -1.33 



(1128, 2)

In [9]:
smiles = data['smile']
print(len(smiles))

smiles[:5]

1128


id
Amigdalin    OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
Fenfuram                                Cc1occc1C(=O)Nc2ccccc2
citral                                    CC(C)=CCCC(C)=CC(=O)
Picene                      c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
Thiophene                                              c1ccsc1
Name: smile, dtype: object

In [10]:
smiles_list = list(smiles)

In [11]:
%%time

# record molecules
ms = [Chem.MolFromSmiles(smile) for smile in smiles]

CPU times: user 302 ms, sys: 12.7 ms, total: 315 ms
Wall time: 375 ms


In [12]:
# get ecfp4
%time ecfp4 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=2, nBits=2048) for m in ms]

CPU times: user 90.2 ms, sys: 4.42 ms, total: 94.6 ms
Wall time: 94.2 ms


In [13]:
len(ecfp4), len(ecfp4[0])

(1128, 2048)

In [14]:
# record them to np.array
%time ecfp4_np = np.array(ecfp4)

CPU times: user 3.99 s, sys: 43.1 ms, total: 4.03 s
Wall time: 4.06 s


In [15]:
ecfp4_np.shape

(1128, 2048)

In [16]:
# record pandas DataFrame
%time ecfp4_pd = pd.DataFrame(ecfp4_np)

CPU times: user 587 µs, sys: 191 µs, total: 778 µs
Wall time: 787 µs


In [17]:
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
list(data.index)[:20]

['Amigdalin',
 'Fenfuram',
 'citral',
 'Picene',
 'Thiophene',
 'benzothiazole',
 "2,2,4,6,6'-PCB",
 'Estradiol',
 'Dieldrin',
 'Rotenone',
 '2-pyrrolidone',
 '2-Chloronapthalene',
 '1-Pentene ',
 'Primidone',
 'Tetradecane',
 '2-Chloropropane',
 '2-Methylbutanol',
 'Benzonitrile',
 'Diazinon',
 '2-Undecanol']

In [19]:
ecfp4_pd.index = list(data.index)

In [20]:
ecfp4_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
Amigdalin,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fenfuram,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citral,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Picene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Thiophene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## ECFP-6

In [21]:
# get ecfp6
%time ecfp6 = [AllChem.GetMorganFingerprintAsBitVect(m, radius=3, nBits=2048) for m in ms]

CPU times: user 105 ms, sys: 3.58 ms, total: 108 ms
Wall time: 107 ms


In [22]:
len(ecfp6), len(ecfp6[0])

(1128, 2048)

In [23]:
# record them to np.array
%time ecfp6_np = np.array(ecfp6)

CPU times: user 4.17 s, sys: 58.5 ms, total: 4.22 s
Wall time: 4.3 s


In [24]:
ecfp6_np.shape

(1128, 2048)

In [25]:
# record pandas DataFrame
%time ecfp6_pd = pd.DataFrame(ecfp6_np)

CPU times: user 755 µs, sys: 347 µs, total: 1.1 ms
Wall time: 842 µs


In [26]:
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
list(data.index)[:20]

['Amigdalin',
 'Fenfuram',
 'citral',
 'Picene',
 'Thiophene',
 'benzothiazole',
 "2,2,4,6,6'-PCB",
 'Estradiol',
 'Dieldrin',
 'Rotenone',
 '2-pyrrolidone',
 '2-Chloronapthalene',
 '1-Pentene ',
 'Primidone',
 'Tetradecane',
 '2-Chloropropane',
 '2-Methylbutanol',
 'Benzonitrile',
 'Diazinon',
 '2-Undecanol']

In [28]:
ecfp6_pd.index = list(data.index)

In [29]:
ecfp6_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
Amigdalin,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fenfuram,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
citral,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Picene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Thiophene,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Saving both to .csv and to pickle files

In [30]:
# check the directory
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_extra.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv


In [31]:
%%time

ecfp4_pd.to_csv(f'../data/{dataset}_{smile_type}_ecfp4_features.csv', index=True)
ecfp6_pd.to_csv(f'../data/{dataset}_{smile_type}_ecfp6_features.csv', index=True)

CPU times: user 891 ms, sys: 69.5 ms, total: 960 ms
Wall time: 1.01 s


In [32]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
