In [1]:
%ls

get_protenated_from_canonical.ipynb


In [2]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv


# Import modules

In [3]:
import pathlib

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolfiles import SmilesWriter

# Load Data

In [7]:
# Dataset prefix used to build the data file paths in the cells below.
# The ../data/ listing shows files for 'esol', 'freesolv' and 'lipophilicity'.
dataset = 'lipophilicity'

In [8]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv


In [9]:
# Load the id / SMILES / label table of the selected dataset.
labels_csv = f'../data/{dataset}_original_IdSmilesLabels.csv'
data = pd.read_csv(labels_csv)

# Preview the first rows (followed by a blank line); the cell output is the table shape.
print(data.head(), '\n')
data.shape

              id                                             smiles  labels
0   CHEMBL596271            Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14    3.54
1  CHEMBL1951080  COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...   -1.18
2     CHEMBL1771             COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl    3.69
3   CHEMBL234951  OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...    3.37
4   CHEMBL565079  Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...    3.10 



(4200, 3)

# Create molecules from smiles

In [11]:
data['smiles'][0]

'Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14'

In [12]:
data['id'][0]

'CHEMBL596271'

In [14]:
# Build one RDKit molecule per row of `data` and tag it with its id.
# The "_Name" property carries the id into the .smi file written further down.
ms = []

# 2.5 seconds for lipophilicity
for mol_id, smi in zip(data['id'], data['smiles']):
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # MolFromSmiles returns None (it does not raise) for unparsable SMILES;
        # fail here with the offending record instead of an AttributeError below.
        raise ValueError(f'RDKit could not parse SMILES {smi!r} (id {mol_id!r})')
    # SetProp only accepts strings, so coerce ids that pandas parsed as numbers.
    m.SetProp("_Name", str(mol_id))
    ms.append(m)

# Write the molecules in the `smiles.smi` file

## All molecules

In [15]:
# Write every molecule in `ms` to the .smi file of the selected dataset.
writer = SmilesWriter(f'../data/{dataset}_original_smiles.smi')

# try/finally guarantees the file is flushed and closed even if a write fails.
try:
    for m in ms:
        writer.write(m)
finally:
    writer.close()

In [16]:
from IPython.display import clear_output

In [17]:
# check how we can read the molecules
check_suppl = Chem.SmilesMolSupplier(f'../data/{dataset}_original_smiles.smi')

for mol in check_suppl:
    # The supplier yields None for a line it cannot parse; skip such entries
    # (same guard as the later list-building cell) instead of crashing.
    if mol is None:
        continue
    print(Chem.MolToSmiles(mol))

# The printed SMILES were only a smoke test; remove them from the cell output.
clear_output()

## 20 molecules (for speedup)

In [18]:
# Keep only the first 20 molecules so that the following cells run quickly.
ms20 = ms[0:20]

In [19]:
# small version to run fast: write only the 20-molecule subset
writer = SmilesWriter(f'../data/{dataset}_original_smiles20.smi')

# try/finally guarantees the file is flushed and closed even if a write fails.
try:
    for m in ms20:
        writer.write(m)
finally:
    writer.close()

In [20]:
%ls ../data/

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi


# Compare smiles written from the molecules to the original smiles

In [22]:
# SMILES strings exactly as stored in the CSV, in row order.
smiles_list = data['smiles'].tolist()

print(smiles_list[0:5])

['Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14', 'COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23', 'COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl', 'OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(Cl)sc4[nH]3', 'Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)NCC#N)c1']


In [23]:
# Read the molecules back from the .smi file and convert each to a SMILES string.
check_suppl = Chem.SmilesMolSupplier(f'../data/{dataset}_original_smiles.smi')
# Entries the supplier could not parse come back as None; use an identity test
# (`is not None`) rather than `!=`, which dispatches to the object's comparison.
check_smiles_list = [Chem.MolToSmiles(mol) for mol in check_suppl if mol is not None]

print(len(check_smiles_list))
check_smiles_list[:5]

4200


['Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21',
 'COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)O',
 'COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1',
 'O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1',
 'Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)NCC#N)c1']

In [24]:
set(check_smiles_list) == set(smiles_list)

False

In [25]:
len(set(check_smiles_list).intersection(set(smiles_list)))

354

In [26]:
len(smiles_list)

4200

In [27]:
print("Original:\n", smiles_list[0])
Chem.MolToSmiles(Chem.MolFromSmiles(smiles_list[0]))

Original:
 Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14


'Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21'

In [28]:
print("After:\n", check_smiles_list[0])
Chem.MolToSmiles(Chem.MolFromSmiles(check_smiles_list[0]))

After:
 Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21


'Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21'

In [29]:
# Round-trip every original SMILES through RDKit and compare the result,
# position by position, with the list read back from the .smi file.
[Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list] == check_smiles_list

True

`check_smiles_list` and `smiles_list` contain the same molecules in the same order; the only difference is that `check_smiles_list` stores every SMILES in RDKit's canonical form.

# Create a file with protonated smiles

In [30]:
# check the directory of dimorphite_dl.py file

%ls ../../../packages/dimorphite_dl-1.2.4/

[31mCHANGES.md[m[m*                [34m__pycache__[m[m/
[31mCONTRIBUTORS.md[m[m*           [31mdimorphite_dl.py[m[m*
[31mLICENSE.txt[m[m*               [31msample_molecules.smi[m[m*
[31mREADME.md[m[m*                 [31msite_substructures.smarts[m[m*
[31m__init__.py[m[m*               [34mtraining_data[m[m/


In [31]:
# print output (do not write the file)
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file ../data/lipophilicity_original_smiles20.smi --min_ph 7.4 --max_ph 7.4


For help, use: python dimorphite_dl.py --help

If you use Dimorphite-DL in your research, please cite:
Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An
open-source program for enumerating the ionization states of drug-like small
molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.


PARAMETERS:

 label_states: False
       max_ph: 7.4
 max_variants: 128
       min_ph: 7.4
  output_file: None
pka_precision: 1.0
       silent: False
       smiles: None
  smiles_file: ../data/lipophilicity_original_smiles20.smi
         test: False


Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	CHEMBL596271
Cn1c(C[NH+]2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	CHEMBL596271
COc1cc(OC)c(S(=O)(=O)[NH+]2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	CHEMBL1951080
COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	CHEMBL1951080
COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1	CHEMBL1771
COC(=O)[C@H](c1ccccc1Cl)[NH+]1CCc2sccc2C1	CHEMBL1771
O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1	CHEMBL234951


In [32]:
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file ../data/lipophilicity_original_smiles20.smi --min_ph 7.4 --max_ph 7.4 --silent

Cn1c(C[NH+]2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	CHEMBL596271
Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21	CHEMBL596271
COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	CHEMBL1951080
COc1cc(OC)c(S(=O)(=O)[NH+]2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)[O-]	CHEMBL1951080
COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1	CHEMBL1771
COC(=O)[C@H](c1ccccc1Cl)[NH+]1CCc2sccc2C1	CHEMBL1771
O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl)sc2[nH]1	CHEMBL234951
Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)NCC#N)c1	CHEMBL565079
OC1(C#Cc2ccc(-c3ccccc3)cc2)C[NH+]2CCC1CC2	CHEMBL317462
OC1(C#Cc2ccc(-c3ccccc3)cc2)CN2CCC1CC2	CHEMBL317462
COc1cc(OC)c(S(=O)(=O)NCc2ccccc2N2CCCCC2)cc1NC(=O)CCC(=O)[O-]	CHEMBL1951182
COc1cc(OC)c(S(=O)(=O)[N-]Cc2ccccc2N2CCCCC2)cc1NC(=O)CCC(=O)[O-]	CHEMBL1951182
CNc1cccc(CCOc2ccc(C[C@H](NC(=O)c3c(Cl)cccc3Cl)C(=O)[O-])cc2C)n1	CHEMBL2030960
COc1ccc(-c2coc3cc(OC)cc(OC)c3c2=O)cc1	CHEMBL13097
[O-]c1ncnc2scc(-c3ccsc3)c12	CHEMBL1527751
Oc1ncnc2scc(-c3ccsc3)c12	CHEMBL1527751
CS(=O)(=O)c1ccc(Oc2ccc(C#C[C@]3(O)

In [33]:
# print output in silent mode (do not write the file)
! python ../../../packages/dimorphite_dl-1.2.4/dimorphite_dl.py --smiles_file ../data/lipophilicity_original_smiles20.smi --min_ph 7.4 --max_ph 7.4 --silent > ../data/lipophilicity_protonated_smiles20_messy.txt

In [34]:
%ls ../data

ESOL_README
FreeSolv_README
Lipo_README
esol_original.csv
esol_original_1024ecfp4_features.csv
esol_original_1024ecfp6_features.csv
esol_original_2048ecfp4_features.csv
esol_original_2048ecfp6_features.csv
esol_original_IdSmilesLabels.csv
esol_original_extra_features.csv
esol_original_rdkit_features.csv
freesolv_original.csv
freesolv_original_1024ecfp4_features.csv
freesolv_original_1024ecfp6_features.csv
freesolv_original_2048ecfp4_features.csv
freesolv_original_2048ecfp6_features.csv
freesolv_original_IdSmilesLabels.csv
freesolv_original_rdkit_features.csv
lipophilicity_original.csv
lipophilicity_original_1024ecfp4_features.csv
lipophilicity_original_1024ecfp6_features.csv
lipophilicity_original_2048ecfp4_features.csv
lipophilicity_original_2048ecfp6_features.csv
lipophilicity_original_IdSmilesLabels.csv
lipophilicity_original_rdkit_features.csv
lipophilicity_original_smiles.smi
lipophilicity_original_smiles20.smi
lipophilicity_protonated_smiles20_messy.txt

In [35]:
# Parse the redirected Dimorphite-DL output into two parallel lists:
# the protonated SMILES strings and the molecule name on the same line.
smiles = []
names = []

with open(f'../data/{dataset}_protonated_smiles20_messy.txt', 'r') as f:
    for line_no, line in enumerate(f, start=1):
        # Split on the first run of whitespace only, so a name that itself
        # contains spaces stays intact.
        fields = line.strip().split(maxsplit=1)
        if not fields:
            # Blank line: nothing to record.
            continue
        if len(fields) != 2:
            raise ValueError(
                f'line {line_no}: expected "<smiles> <name>", got {line!r}'
            )
        smile, name = fields
        smiles.append(smile)
        names.append(name)

In [43]:
# One row per protonation variant: molecule id plus its SMILES string.
df = pd.DataFrame(dict(id=names, smile=smiles))

In [44]:
df

Unnamed: 0,id,smile
0,CHEMBL596271,Cn1c(C[NH+]2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21
1,CHEMBL596271,Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21
2,CHEMBL1951080,COc1cc(OC)c(S(=O)(=O)[NH+]2c3ccccc3CCC2C)cc1NC...
3,CHEMBL1951080,COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)...
4,CHEMBL1771,COC(=O)[C@H](c1ccccc1Cl)[NH+]1CCc2sccc2C1
5,CHEMBL1771,COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1
6,CHEMBL234951,O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl...
7,CHEMBL565079,Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)N...
8,CHEMBL317462,OC1(C#Cc2ccc(-c3ccccc3)cc2)C[NH+]2CCC1CC2
9,CHEMBL317462,OC1(C#Cc2ccc(-c3ccccc3)cc2)CN2CCC1CC2


In [51]:
# Print every SMILES variant recorded for one example molecule.
for variant in df.loc[df['id'] == 'CHEMBL578201', 'smile']:
    print(variant)

Cc1cc(Nc2nc([NH2+][C@@H](C)c3ccc(F)cn3)c(C#N)nc2C)n[n-]1
Cc1cc(Nc2nc([NH2+][C@@H](C)c3ccc(F)cn3)c(C#N)nc2C)n[nH]1
Cc1cc(Nc2nc(N[C@@H](C)c3ccc(F)cn3)c(C#N)nc2C)n[nH]1
Cc1cc(Nc2nc(N[C@@H](C)c3ccc(F)cn3)c(C#N)nc2C)n[n-]1


In [47]:
df.id.value_counts()

CHEMBL578201     4
CHEMBL2165055    4
CHEMBL317462     2
CHEMBL1951182    2
CHEMBL1951080    2
CHEMBL596271     2
CHEMBL2153181    2
CHEMBL2171986    2
CHEMBL93884      2
CHEMBL1771       2
CHEMBL1916276    2
CHEMBL1527751    2
CHEMBL1940306    2
CHEMBL190044     2
CHEMBL469790     1
CHEMBL2030960    1
CHEMBL565079     1
CHEMBL13097      1
CHEMBL234951     1
CHEMBL276218     1
Name: id, dtype: int64