In [2]:
%ls

get_all_fingerprints_from_all_smiles.ipynb
get_fingerprints_lipophilicity_original.ipynb
get_original_id_smile_target_esol.ipynb
get_original_id_smile_target_freeesolv.ipynb
get_original_id_smile_target_lipophilicity.ipynb
get_protenated_from_canonical.ipynb
get_rdkit_descriptors_freesolv_original.ipynb
rf_grid_search_on_fingerprints.ipynb


In [3]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_IdSmileTarget.csv
esol_original_ecfp4_features.csv
esol_original_ecfp6_features.csv
esol_original_extra_features.csv
freesolv_original.csv
freesolv_original_IdSmileTarget.csv
freesolv_original_ecfp4_features.csv
freesolv_original_ecfp6_features.csv
lipophilicity_original.csv
lipophilicity_original_IdSmileTarget.csv
lipophilicity_original_ecfp4_features.csv
lipophilicity_original_ecfp6_features.csv


# Import modules

In [11]:
import pathlib

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolfiles import SmilesWriter

# Load Data

In [None]:
# Name of the dataset; it is interpolated into the input CSV path when the data is loaded.
dataset = 'lipophilicity'

In [3]:
# Load the id / SMILES / target table for the selected dataset, using the first CSV column as the index.
data = pd.read_csv(f'../data/{dataset}_original_IdSmileTarget.csv', index_col=0)
# Preview the first rows (followed by a blank line); the cell's displayed value is the table shape.
print(data.head(), '\n')
data.shape

                exp                                             smiles
CMPD_CHEMBLID                                                         
CHEMBL596271   3.54            Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080 -1.18  COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
CHEMBL1771     3.69             COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951   3.37  OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
CHEMBL565079   3.10  Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N... 



(4200, 2)

In [4]:
# Take the 'exp' column as the target series and preview its first five values.
target = data['exp']
target[:5]

CMPD_CHEMBLID
CHEMBL596271     3.54
CHEMBL1951080   -1.18
CHEMBL1771       3.69
CHEMBL234951     3.37
CHEMBL565079     3.10
Name: exp, dtype: float64

In [5]:
# Save the target series to 'logD.csv' in the current working directory.
# NOTE(review): the input is read from ../data/ but this file is written next to the
# notebook — confirm that this is the intended location.
target.to_csv('logD.csv')

In [6]:
# List the working directory to confirm that logD.csv was written.
# NOTE(review): this listing does not match the directory listing shown at the top of the
# notebook, so the cells were presumably executed from different directories or sessions —
# confirm before relying on the relative paths used here.
%ls

Lipo_README                           get_fingerprints_local.ipynb
Lipophilicity.csv                     get_protenated_from_canonical.ipynb
cdk2smi1.smi                          logD.csv
ecfp4_features.csv                    protonated_smiles.smi
ecfp6_features.csv                    rf_grid_search_on_fingerprints.ipynb
example.smi                           smiles.smi
get_fingerprints.ipynb


In [57]:
# Take the 'smiles' column, print how many entries it has, and preview the first five.
smiles = data['smiles']
print(len(smiles))

smiles[:5]

4200


CMPD_CHEMBLID
CHEMBL596271               Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
CHEMBL1951080    COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
CHEMBL1771                  COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
CHEMBL234951     OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
CHEMBL565079     Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
Name: smiles, dtype: object

In [58]:
# Plain Python list of the SMILES strings, in the same order as the `smiles` series.
smiles_list = smiles.tolist()

In [59]:
# Keep the first 100 SMILES strings as a small subset.
smiles_list100 = smiles_list[:100]

# Create molecules from smiles

In [41]:
%%time

# record molecules
ms = [Chem.MolFromSmiles(smile) for smile in smiles_list]

CPU times: user 2.09 s, sys: 225 ms, total: 2.32 s
Wall time: 2.38 s


In [42]:
%%time

# First 100 molecules: a small subset of `ms`.
ms100 = ms[:100]

CPU times: user 9 µs, sys: 1e+03 ns, total: 10 µs
Wall time: 14.1 µs


# Write the molecules to the `smiles.smi` file

In [43]:
# Open an RDKit SmilesWriter on 'smiles.smi'; the molecules are written to it in the following cell.
writer = SmilesWriter('smiles.smi')

In [44]:
# Write every molecule to the open writer. The writer is closed in `finally` so the
# file handle is released even if one of the writes raises.
# Note: `writer` is created in the previous cell and is closed here, so that cell must
# be re-run before this one is run again.
try:
    for m in ms:
        writer.write(m)
finally:
    writer.close()

In [45]:
%ls

Lipo_README                           get_fingerprints_local.ipynb
Lipophilicity.csv                     get_protenated_from_canonical.ipynb
cdk2smi1.smi                          logD.csv
ecfp4_features.csv                    protonated_smiles.smi
ecfp6_features.csv                    protonated_smiles_messy.txt
example.smi                           rf_grid_search_on_fingerprints.ipynb
get_fingerprints.ipynb                smiles.smi


In [46]:
# small version to run fast
writer = SmilesWriter('smiles100.smi')

# Write only the first 100 molecules. The writer is closed in `finally` so the file
# handle is released even if one of the writes raises.
try:
    for m in ms100:
        writer.write(m)
finally:
    writer.close()

In [47]:
%ls

Lipo_README                           get_protenated_from_canonical.ipynb
Lipophilicity.csv                     logD.csv
cdk2smi1.smi                          protonated_smiles.smi
ecfp4_features.csv                    protonated_smiles_messy.txt
ecfp6_features.csv                    rf_grid_search_on_fingerprints.ipynb
example.smi                           smiles.smi
get_fingerprints.ipynb                smiles100.smi
get_fingerprints_local.ipynb


# Compare smiles written from the molecules to the original smiles

In [15]:
# Read the molecules back from the file that was just written.
check_suppl = Chem.SmilesMolSupplier('smiles.smi')

In [16]:
%%time

check_smiles_list = [Chem.MolToSmiles(mol) for mol in check_suppl if mol != None]
len(check_smiles_list)
check_smiles_list[:5]

CPU times: user 3.15 s, sys: 89.3 ms, total: 3.24 s
Wall time: 3.31 s


['Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21',
 'COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)O',
 'COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1',
 'O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1',
 'Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)NCC#N)c1']

In [17]:
# Are the re-read SMILES strings and the original strings textually the same set?
set(check_smiles_list) == set(smiles_list)

False

In [18]:
# Number of distinct SMILES strings that appear in both the re-read and the original list.
len(set(check_smiles_list) & set(smiles_list))

354

In [19]:
# Total number of original SMILES strings, for comparison with the overlap count.
len(smiles_list)

4200

In [20]:
# Print the first original SMILES; the cell's displayed value is the same molecule after an
# RDKit round trip (parse with MolFromSmiles, write back with MolToSmiles).
print("Original:\n", smiles_list[0])
Chem.MolToSmiles(Chem.MolFromSmiles(smiles_list[0]))

Original:
 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14


'Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21'

In [21]:
# Repeat the round trip on the first SMILES that was read back from smiles.smi.
print("After:\n", check_smiles_list[0])
Chem.MolToSmiles(Chem.MolFromSmiles(check_smiles_list[0]))

After:
 Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21


'Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21'

In [22]:
# Round-trip every original SMILES through RDKit and compare, position by position,
# with the strings read back from the file.
[Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list] == check_smiles_list

True

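`check_smiles_list` and `smiles_list` describe the same molecules in the same order. The only difference is that every entry of `check_smiles_list` is in RDKit's canonical SMILES form, which is why only 354 of the 4200 strings are textually identical.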

# Create a file with protonated smiles

In [48]:
# Check that dimorphite_dl.py is present in the local Dimorphite-DL 1.2.4 package directory.

%ls ../../../packages/dimorphite_dl-1.2.4/

[31mCHANGES.md[m[m*                [34m__pycache__[m[m/
[31mCONTRIBUTORS.md[m[m*           [31mdimorphite_dl.py[m[m*
[31mLICENSE.txt[m[m*               [31msample_molecules.smi[m[m*
[31mREADME.md[m[m*                 [31msite_substructures.smarts[m[m*
[31m__init__.py[m[m*               [34mtraining_data[m[m/


In [49]:
%%time

# Run Dimorphite-DL on the 100-molecule file with both pH bounds set to 7.4 and write the
# result to protonated_smiles100.smi via --output_file.
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file smiles100.smi --min_ph 7.4 --max_ph 7.4 --output_file protonated_smiles100.smi


For help, use: python dimorphite_dl.py --help

If you use Dimorphite-DL in your research, please cite:
Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An
open-source program for enumerating the ionization states of drug-like small
molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.


PARAMETERS:

 label_states: False
       max_ph: 7.4
 max_variants: 128
       min_ph: 7.4
  output_file: protonated_smiles100.smi
pka_precision: 1.0
       silent: False
       smiles: None
  smiles_file: smiles100.smi
         test: False


CPU times: user 41.3 ms, sys: 22.2 ms, total: 63.5 ms
Wall time: 2.21 s


In [50]:
%%time

# Same run without --output_file, so the generated SMILES are printed in the cell output.
# Each output line is a SMILES string, a tab and an integer; several variants can share one
# integer (presumably the index of the input molecule — confirm).
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file smiles100.smi --min_ph 7.4 --max_ph 7.4


For help, use: python dimorphite_dl.py --help

If you use Dimorphite-DL in your research, please cite:
Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An
open-source program for enumerating the ionization states of drug-like small
molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.


PARAMETERS:

 label_states: False
       max_ph: 7.4
 max_variants: 128
       min_ph: 7.4
  output_file: None
pka_precision: 1.0
       silent: False
       smiles: None
  smiles_file: smiles100.smi
         test: False


Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	0
Cn1c(C[NH+]2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	0
COc1cc(OC)c(S(=O)(=O)[NH+]2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	1
COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	1
COC(=O)[C@H](c1ccccc1Cl)[NH+]1CCc2sccc2C1	2
COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1	2
O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1	3
Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)NCC#N)c1	4
OC1(C#Cc2ccc(-c3ccccc3)cc2)CN2CCC1CC2	5
OC1(C#Cc2

COCC[NH2+]Cc1ccc(CC[NH2+]C[C@H](O)c2ccc([O-])c3[n-]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CCNC[C@H](O)c2ccc([O-])c3[nH]c(=O)sc23)cc1	71
COCCNCc1ccc(CCNC[C@H](O)c2ccc([O-])c3[nH]c(=O)sc23)cc1	71
COCCNCc1ccc(CC[NH2+]C[C@H](O)c2ccc([O-])c3[nH]c(=O)sc23)cc1	71
COCCNCc1ccc(CCNC[C@H](O)c2ccc(O)c3[n-]c(=O)sc23)cc1	71
COCCNCc1ccc(CC[NH2+]C[C@H](O)c2ccc([O-])c3[n-]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CCNC[C@H](O)c2ccc([O-])c3[n-]c(=O)sc23)cc1	71
COCCNCc1ccc(CC[NH2+]C[C@H](O)c2ccc(O)c3[nH]c(=O)sc23)cc1	71
COCCNCc1ccc(CC[NH2+]C[C@H](O)c2ccc(O)c3[n-]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CC[NH2+]C[C@H](O)c2ccc([O-])c3[nH]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CC[NH2+]C[C@H](O)c2ccc(O)c3[nH]c(=O)sc23)cc1	71
COCCNCc1ccc(CCNC[C@H](O)c2ccc([O-])c3[n-]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CCNC[C@H](O)c2ccc(O)c3[nH]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CC[NH2+]C[C@H](O)c2ccc(O)c3[n-]c(=O)sc23)cc1	71
COCC[NH2+]Cc1ccc(CCNC[C@H](O)c2ccc(O)c3[n-]c(=O)sc23)cc1	71
COCCNCc1ccc(CCNC[C@H](O)c2ccc(O)c3[nH]c(=O)sc23)cc1	71
C#CCn1c(=O)c2c(-

In [55]:
%%time

# Same run with stdout redirected to protonated_smiles100_messy.txt.
# NOTE(review): --silent presumably suppresses the banner and parameter listing so that only
# the SMILES lines reach the file — confirm against the Dimorphite-DL help text.
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file smiles100.smi --min_ph 7.4 --max_ph 7.4 --silent > protonated_smiles100_messy.txt

CPU times: user 44.3 ms, sys: 27 ms, total: 71.3 ms
Wall time: 2.29 s


In [56]:
%ls

Lipo_README                           logD.csv
Lipophilicity.csv                     protonated_smiles.smi
cdk2smi1.smi                          protonated_smiles100.smi
ecfp4_features.csv                    protonated_smiles100_messy.txt
ecfp6_features.csv                    protonated_smiles_messy.txt
example.smi                           rf_grid_search_on_fingerprints.ipynb
get_fingerprints.ipynb                smiles.smi
get_fingerprints_local.ipynb          smiles100.smi
get_protenated_from_canonical.ipynb


In [53]:
# Open the protonated SMILES file for reading.
# NOTE(review): the following cell reads 0 molecules from this file, and no cell in this
# notebook writes 'protonated_smiles.smi' (the Dimorphite-DL runs write
# protonated_smiles100.smi and protonated_smiles100_messy.txt) — verify the file name and
# its contents. If the file has no header row, also check the supplier's title-line
# setting — TODO confirm.
prot_suppl = Chem.SmilesMolSupplier('protonated_smiles.smi')

In [54]:
%%time

prot_smiles_list = [Chem.MolToSmiles(mol) for mol in prot_suppl if mol != None]
print(len(prot_smiles_list))
prot_smiles_list[:5]

0
CPU times: user 1.7 s, sys: 527 ms, total: 2.23 s
Wall time: 2.36 s


[]

In [None]:
with open('protonated_smiles100_messy.txt', 'r') as f:
    prot100 = []