CURATION SCRIPT
by: Igor Sanches

**IMPORT DEPENDENCIES AND IN-HOUSE FUNCTIONS**

In [1]:
#import libraries and dependencies

import pandas as pd
import math
import numpy as np

from rdkit import Chem
from chembl_structure_pipeline import standardizer
from rdkit.Chem.MolStandardize.metal import MetalDisconnector
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import inchi as rd_inchi

from molvs import standardize_smiles
from molvs import Standardizer
from rdkit.Chem import Draw

In [23]:
#in-house functions
def metal_atomic_numbers(at):
    """ This function checks the atomic number of an atom """
    
    n = at.GetAtomicNum()
    return (n==13) or (n>=21 and n<=31) or (n>=39 and n<=50) or (n>=57 and n<=83) or (n>=89 and n<=115)

def is_metal(smile):
    """ This function checks if an atom is a metal based on its atomic number """
    mol = Chem.MolFromSmiles(smile)
    rwmol = Chem.RWMol(mol)
    rwmol.UpdatePropertyCache(strict=False)
    metal = [at.GetSymbol() for at in rwmol.GetAtoms() if metal_atomic_numbers(at)]
    return len(metal) == 1

def smiles_preparator(smile):
    """ This function prepares smiles by removing stereochemistry """
    smile1 = smile.replace('@','')
    smile2 = smile1.replace('/','')
    smile3 = smile2.replace("\\",'')
    return str(smile3)

def salt_remover(mol):
    """ This function removes salts, see complete list of possible salts in https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt """
    remover1 = SaltRemover(defnData=None)
    remover2 = SaltRemover(defnData="[Cl,Br,I]")
    remover3 = SaltRemover(defnData="[Li,Na,K,Ca,Mg]")
    remover4 = SaltRemover(defnData="[O,N]")
    remover5 = SaltRemover(defnData="[H]")
    remover6 = SaltRemover(defnData="[Ba]")
    remover7 = SaltRemover(defnData="[Al]")
    remover8 = SaltRemover(defnData="[Cu]")
    remover9 = SaltRemover(defnData="[Cs]")
    remover10 = SaltRemover(defnData="[Zn]")
    remover11 = SaltRemover(defnData="[Mn]")
    remover12 = SaltRemover(defnData="Cl[Cr]Cl")
    remover13 = SaltRemover(defnData="COS(=O)(=O)[O-]")
    remover14 = SaltRemover(defnData="[Sb]")
    remover15 = SaltRemover(defnData="[Cr]")
    remover16 = SaltRemover(defnData="[Ni]")
    remover17 = SaltRemover(defnData="[B]")
    stripped1 = remover1.StripMol(mol, dontRemoveEverything=True)
    stripped2 = remover2.StripMol(stripped1, dontRemoveEverything=True)
    stripped3 = remover3.StripMol(stripped2, dontRemoveEverything=True)
    stripped4 = remover4.StripMol(stripped3, dontRemoveEverything=True)
    stripped5 = remover5.StripMol(stripped4, dontRemoveEverything=True)
    stripped6 = remover6.StripMol(stripped5, dontRemoveEverything=True)
    stripped7 = remover7.StripMol(stripped6, dontRemoveEverything=True)
    stripped8 = remover8.StripMol(stripped7, dontRemoveEverything=True)
    stripped9 = remover9.StripMol(stripped8, dontRemoveEverything=True)
    stripped10 = remover10.StripMol(stripped9, dontRemoveEverything=True)
    stripped11 = remover11.StripMol(stripped10, dontRemoveEverything=True)
    stripped12 = remover12.StripMol(stripped11, dontRemoveEverything=True)
    stripped13 = remover13.StripMol(stripped12, dontRemoveEverything=True)
    stripped14 = remover14.StripMol(stripped13, dontRemoveEverything=True)
    stripped15 = remover15.StripMol(stripped14, dontRemoveEverything=True)
    stripped16 = remover16.StripMol(stripped15, dontRemoveEverything=True)
    stripped17 = remover17.StripMol(stripped16, dontRemoveEverything=True)
    return stripped17


**SET PATH**

In [3]:
#choose a path to save
savepath = r'D:\python\jupyterscripts\curation'

In [38]:
#import dataset
df0 = pd.read_csv(r'D:\hergproject\ChEMBLv29cmpds\single protein format\precurated\single protein format - (IC50only) precuration.csv', encoding='latin-1')
df0

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,Assay ChEMBL ID,Assay Description
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,IC50,'=',645.0,nM,CHEMBL962309,Displacement of [3H]dofetilide from human ERG ...
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,IC50,'=',21400.0,nM,CHEMBL1120514,Displacement of [3H]dofetilide from human ERG
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,IC50,'=',20800.0,nM,CHEMBL1120514,Displacement of [3H]dofetilide from human ERG
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,IC50,'=',47600.0,nM,CHEMBL3598727,Inhibition of [35S]MK499 binding to human ERG ...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,IC50,'=',16940.0,nM,CHEMBL3598727,Inhibition of [35S]MK499 binding to human ERG ...
...,...,...,...,...,...,...,...,...
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,IC50,'=',800.0,nM,CHEMBL4050158,Inhibition of MK499 binding to human ERG
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,IC50,'=',40000.0,nM,CHEMBL3399089,Inhibition of human ERG
5117,CHEMBL4633573,Cn1nccc1[C@H]1CCC[C@@H]1Oc1cc(F)c(S(=O)(=O)Nc2...,IC50,'>',100000.0,nM,CHEMBL4623822,Inhibition of human ERG by Ionworks high-throu...
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,IC50,'=',26000.0,nM,CHEMBL3999906,Inhibition of human ERG


**DATA PREPARATION**

In [39]:
#remove unwanted columns
dropList = ['Standard Type', 'Standard Units', 'Assay ChEMBL ID', 'Assay Description']
df1 = df0.drop(columns = dropList)
df1

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Relation,Standard Value
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,'=',645.0
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,'=',21400.0
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,'=',20800.0
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,'=',47600.0
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,'=',16940.0
...,...,...,...,...
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,'=',800.0
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,'=',40000.0
5117,CHEMBL4633573,Cn1nccc1[C@H]1CCC[C@@H]1Oc1cc(F)c(S(=O)(=O)Nc2...,'>',100000.0
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,'=',26000.0


In [40]:
#rename columns
df1.rename(columns = {'Molecule ChEMBL ID':'ID', 'Smiles':'SMILES', 'Standard Value':'IC50 (nM)', 'Standard Relation':'Relation'}, inplace = True)

In [41]:
#check for unique values before deleting
df1['Relation'].unique()

array(["'='", "'>'", "'<'", nan, "'>='"], dtype=object)

In [42]:
#drop rows with missing values (only drop rows with missing values on activity column)
df2 = df1.dropna(subset=['IC50 (nM)'])

#total removed with activity missing values 
total_removed1 = len(df1)-len(df2)

#drop rows with missing values (only drop rows with missing values on relation column)
df3 = df2.dropna(subset=['Relation'])

#total removed with relation missing values 
total_removed2 = len(df2)-len(df3)

In [43]:
#drop all values != '='
df4 = df3[df3.Relation == "'='"]

#total removed with relation =! '=' 
total_removed3 = len(df3)-len(df4)

#check for unique values before deleting
df4['Relation'].unique()

array(["'='"], dtype=object)

In [44]:
#convert to pIC50
pic50 = []
for value in df4['IC50 (nM)']:
    ic50 = value / 1000
    pic50.append(-(math.log10(ic50*10**-6)))

df4['pIC50 (uM)'] = pic50

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['pIC50 (uM)'] = pic50


In [45]:
#remove stereoisomers 
smiles = [smiles_preparator(str(smile)) for smile in df4['SMILES']]
df4['SMILES_no_stereo'] = smiles

#remove relation column
df4 = df4.drop(columns = 'Relation')
df4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['SMILES_no_stereo'] = smiles


Unnamed: 0,ID,SMILES,IC50 (nM),pIC50 (uM),SMILES_no_stereo
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,645.00,6.190440,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,21400.00,4.669586,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,20800.00,4.681937,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,47600.00,4.322393,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,16940.00,4.771087,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[CH]3C)nc21
...,...,...,...,...,...
5114,CHEMBL4634636,NC(=O)[C@@H]1CC[C@H](c2ccc(OCc3ccccc3F)cc2F)N1,12589.25,4.900000,NC(=O)[CH]1CC[CH](c2ccc(OCc3ccccc3F)cc2F)N1
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,800.00,6.096910,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,40000.00,4.397940,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,26000.00,4.585027,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...


**REMOVE SALTS AND INVALID SMILES**

In [46]:
#remove salts
wrongSmiles = []
new_smiles = []
indexDropList_salts = []
for index, smile in enumerate(df4['SMILES_no_stereo']):
    try:
        mol = Chem.MolFromSmiles(smile)
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))
        
    except:
        wrongSmiles.append(df4.iloc[[index]])
        indexDropList_salts.append(index)


if len(wrongSmiles) == 0:
    print("no wrong smiles found")
    
else:
    #drop wrong smiles
    df4 = df4.drop(df4.index[indexDropList_salts])
    
    print(f"{len(indexDropList_salts)} wrong smiles found")
    
    #save removes mixtures
    wrongsmiles = pd.concat(wrongSmiles)
    wrongsmiles.to_csv(f'{savepath}\\wrongsmiles.csv', sep=',', header=True, index=False)
df4['SMILES_no_salts'] = new_smiles
df4 

no wrong smiles found


Unnamed: 0,ID,SMILES,IC50 (nM),pIC50 (uM),SMILES_no_stereo,SMILES_no_salts
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,645.00,6.190440,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,CNCC1=CC=C(Cl)C=C1OC1=CC=C(Cl)C=C1
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,21400.00,4.669586,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,C1=CC=C(C(CC2=CC=CC3=C2OCC3)N2CCNCC2)C=C1
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,20800.00,4.681937,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,COC1=C(F)C=CC=C1CC(C1=CC=CC=C1)N1CCNCC1
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,47600.00,4.322393,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,COC1=CC=CC=C1N1CCN(CC2=NN3C(N)=NC4=C(OC)C=CC=C...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,16940.00,4.771087,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[CH]3C)nc21,COC1=CC=CC2=C1N=C(N)N1N=C(CN3CCN(C4=CC=CC=C4)C...
...,...,...,...,...,...,...
5114,CHEMBL4634636,NC(=O)[C@@H]1CC[C@H](c2ccc(OCc3ccccc3F)cc2F)N1,12589.25,4.900000,NC(=O)[CH]1CC[CH](c2ccc(OCc3ccccc3F)cc2F)N1,NC(=O)C1CCC(C2=CC=C(OCC3=CC=CC=C3F)C=C2F)N1
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,800.00,6.096910,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,O=C(O)CC1CCC2(CCN(C3=CC=C(C4=NC5=CC(C(F)(F)F)=...
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,40000.00,4.397940,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,CC(C)N1CC2=CC(C3=CC(C4=CC=C(Cl)N=C4)=NO3)=CC=C...
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,26000.00,4.585027,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,CCOC1=CC(C(=O)NC2=CC(C(F)(F)F)=CC=N2)=CC=C1C1=...


**REMOVE ORGANOMETALLICS**

In [47]:
organometals = []
indexDropList_org = []
for index, smile in enumerate(df4['SMILES_no_salts']):
    if is_metal(smile) == True:
        organometals.append(df4.iloc[[index]])
        indexDropList_org.append(index)

if len(indexDropList_org) == 0:
    print("no organometallics found")
    
else:
    #drop organometallics
    df4 = df4.drop(df4.index[indexDropList_org])
    
    print(f"{len(indexDropList_org)} organometallics found")
    
    #save droped organometallics
    organmetal = pd.concat(organometals)
    organmetal.to_csv(f'{savepath}\\organometallics.csv', sep=',', header=True, index=False)
df4

no organometallics found


Unnamed: 0,ID,SMILES,IC50 (nM),pIC50 (uM),SMILES_no_stereo,SMILES_no_salts
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,645.00,6.190440,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,CNCC1=CC=C(Cl)C=C1OC1=CC=C(Cl)C=C1
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,21400.00,4.669586,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,C1=CC=C(C(CC2=CC=CC3=C2OCC3)N2CCNCC2)C=C1
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,20800.00,4.681937,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,COC1=C(F)C=CC=C1CC(C1=CC=CC=C1)N1CCNCC1
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,47600.00,4.322393,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,COC1=CC=CC=C1N1CCN(CC2=NN3C(N)=NC4=C(OC)C=CC=C...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,16940.00,4.771087,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[CH]3C)nc21,COC1=CC=CC2=C1N=C(N)N1N=C(CN3CCN(C4=CC=CC=C4)C...
...,...,...,...,...,...,...
5114,CHEMBL4634636,NC(=O)[C@@H]1CC[C@H](c2ccc(OCc3ccccc3F)cc2F)N1,12589.25,4.900000,NC(=O)[CH]1CC[CH](c2ccc(OCc3ccccc3F)cc2F)N1,NC(=O)C1CCC(C2=CC=C(OCC3=CC=CC=C3F)C=C2F)N1
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,800.00,6.096910,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,O=C(O)CC1CCC2(CCN(C3=CC=C(C4=NC5=CC(C(F)(F)F)=...
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,40000.00,4.397940,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,CC(C)N1CC2=CC(C3=CC(C4=CC=C(Cl)N=C4)=NO3)=CC=C...
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,26000.00,4.585027,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,CCOC1=CC(C(=O)NC2=CC(C(F)(F)F)=CC=N2)=CC=C1C1=...


**REMOVE MIXTURES**

In [48]:
#remove mixtures
mixtureList = []
indexDropList_mix = []
for index, smile in enumerate (df4['SMILES_no_salts']):
    for char in smile:
        if char == '.':
            mixtureList.append(df4.iloc[[index]])
            indexDropList_mix.append(index)
            break

            
if len(indexDropList_mix) == 0:
    print("no mixtures found")
    
else:
    #drop mixtures
    df4 = df4.drop(df4.index[indexDropList_mix])
    
    print(f"{len(indexDropList_mix)} mixtures found")
    
    #save removes mixtures
    mixtures = pd.concat(mixtureList)
    mixtures.to_csv(f'{savepath}\\mixtures.csv', sep=',', header=True, index=False)
df4 

1 mixtures found


Unnamed: 0,ID,SMILES,IC50 (nM),pIC50 (uM),SMILES_no_stereo,SMILES_no_salts
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,645.00,6.190440,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,CNCC1=CC=C(Cl)C=C1OC1=CC=C(Cl)C=C1
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,21400.00,4.669586,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,C1=CC=C(C(CC2=CC=CC3=C2OCC3)N2CCNCC2)C=C1
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,20800.00,4.681937,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,COC1=C(F)C=CC=C1CC(C1=CC=CC=C1)N1CCNCC1
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,47600.00,4.322393,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,COC1=CC=CC=C1N1CCN(CC2=NN3C(N)=NC4=C(OC)C=CC=C...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,16940.00,4.771087,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[CH]3C)nc21,COC1=CC=CC2=C1N=C(N)N1N=C(CN3CCN(C4=CC=CC=C4)C...
...,...,...,...,...,...,...
5114,CHEMBL4634636,NC(=O)[C@@H]1CC[C@H](c2ccc(OCc3ccccc3F)cc2F)N1,12589.25,4.900000,NC(=O)[CH]1CC[CH](c2ccc(OCc3ccccc3F)cc2F)N1,NC(=O)C1CCC(C2=CC=C(OCC3=CC=CC=C3F)C=C2F)N1
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,800.00,6.096910,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,O=C(O)CC1CCC2(CCN(C3=CC=C(C4=NC5=CC(C(F)(F)F)=...
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,40000.00,4.397940,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,CC(C)N1CC2=CC(C3=CC(C4=CC=C(Cl)N=C4)=NO3)=CC=C...
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,26000.00,4.585027,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,CCOC1=CC(C(=O)NC2=CC(C(F)(F)F)=CC=N2)=CC=C1C1=...


**STANDARDISE**

In [49]:
"""
    -Standardize unknown stereochemistry (Handled by the RDKit Mol file parser)
        Fix wiggly bonds on sp3 carbons - sets atoms and bonds marked as unknown stereo to no stereo
        Fix wiggly bonds on double bonds – set double bond to crossed bond
    -Clears S Group data from the mol file
    -Kekulize the structure
    -Remove H atoms (See the page on explicit Hs for more details)
    -Normalization:
        Fix hypervalent nitro groups
        Fix KO to K+ O- and NaO to Na+ O- (Also add Li+ to this)
        Correct amides with N=COH
        Standardise sulphoxides to charge separated form
        Standardize diazonium N (atom :2 here: [*:1]-[N;X2:2]#[N;X1:3]>>[*:1]) to N+
        Ensure quaternary N is charged
        Ensure trivalent O ([*:1]=[O;X2;v3;+0:2]-[#6:3]) is charged
        Ensure trivalent S ([O:1]=[S;D2;+0:2]-[#6:3]) is charged
        Ensure halogen with no neighbors ([F,Cl,Br,I;X0;+0:1]) is charged
    -The molecule is neutralized, if possible. See the page on neutralization rules for more details.
    -Remove stereo from tartrate to simplify salt matching
    -Normalise (straighten) triple bonds and allenes
    
    https://github.com/chembl/ChEMBL_Structure_Pipeline
"""

rdMol = [Chem.MolFromSmiles(smile, sanitize=True) for smile in df4['SMILES_no_salts']]

molBlock = [Chem.MolToMolBlock(mol) for mol in rdMol]

stdMolBlock = [standardizer.standardize_molblock(mol_block) for mol_block in molBlock]

molFromMolBlock = [Chem.MolFromMolBlock(std_molblock) for std_molblock in stdMolBlock]

mol2smiles = [Chem.MolToSmiles(m) for m in molFromMolBlock]

df4['final_smiles'] = mol2smiles

**Remove Salts for the second time**

In [51]:
#remove salts second time
wrongSmiles = []
new_smiles = []
indexDropList_salts = []
for index, smile in enumerate(df4['final_smiles']):
    try:
        mol = Chem.MolFromSmiles(smile)
        remov = salt_remover(mol)
        if remov.GetNumAtoms() <= 2:
            indexDropList_salts.append(index)
        else:
            new_smiles.append(Chem.MolToSmiles(remov, kekuleSmiles=True))
        
    except:
        wrongSmiles.append(df4.iloc[[index]])
        indexDropList_salts.append(index)


if len(wrongSmiles) == 0:
    print("no wrong smiles found")
    
else:
    #drop wrong smiles
    df4 = df4.drop(df4.index[indexDropList_salts])
    
    print(f"{len(indexDropList_salts)} wrong smiles found")
    
    #save removes mixtures
    wrongsmiles = pd.concat(wrongSmiles)
    wrongsmiles.to_csv(f'{savepath}\\wrongsmiles_after_std.csv', sep=',', header=True, index=False)
df4['final_smiles_stand'] = new_smiles
df4 

no wrong smiles found


Unnamed: 0,ID,SMILES,IC50 (nM),pIC50 (uM),SMILES_no_stereo,SMILES_no_salts,final_smiles,final_smiles_stand
0,CHEMBL462387,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,645.00,6.190440,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,CNCC1=CC=C(Cl)C=C1OC1=CC=C(Cl)C=C1,CNCc1ccc(Cl)cc1Oc1ccc(Cl)cc1,CNCC1=CC=C(Cl)C=C1OC1=CC=C(Cl)C=C1
1,CHEMBL1083707,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,21400.00,4.669586,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,C1=CC=C(C(CC2=CC=CC3=C2OCC3)N2CCNCC2)C=C1,c1ccc(C(Cc2cccc3c2OCC3)N2CCNCC2)cc1,C1=CC=C(C(CC2=CC=CC3=C2OCC3)N2CCNCC2)C=C1
2,CHEMBL1086273,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,20800.00,4.681937,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,COC1=C(F)C=CC=C1CC(C1=CC=CC=C1)N1CCNCC1,COc1c(F)cccc1CC(c1ccccc1)N1CCNCC1,COC1=C(F)C=CC=C1CC(C1=CC=CC=C1)N1CCNCC1
3,CHEMBL3596511,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,47600.00,4.322393,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)[C...,COC1=CC=CC=C1N1CCN(CC2=NN3C(N)=NC4=C(OC)C=CC=C...,COc1ccccc1N1CCN(Cc2nc3c4cccc(OC)c4nc(N)n3n2)C(...,COC1=CC=CC=C1N1CCN(CC2=NN3C(N)=NC4=C(OC)C=CC=C...
4,CHEMBL3596506,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[C@H]3C)...,16940.00,4.771087,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)C[CH]3C)nc21,COC1=CC=CC2=C1N=C(N)N1N=C(CN3CCN(C4=CC=CC=C4)C...,COc1cccc2c1nc(N)n1nc(CN3CCN(c4ccccc4)CC3C)nc21,COC1=CC=CC2=C1N=C(N)N1N=C(CN3CCN(C4=CC=CC=C4)C...
...,...,...,...,...,...,...,...,...
5114,CHEMBL4634636,NC(=O)[C@@H]1CC[C@H](c2ccc(OCc3ccccc3F)cc2F)N1,12589.25,4.900000,NC(=O)[CH]1CC[CH](c2ccc(OCc3ccccc3F)cc2F)N1,NC(=O)C1CCC(C2=CC=C(OCC3=CC=CC=C3F)C=C2F)N1,NC(=O)C1CCC(c2ccc(OCc3ccccc3F)cc2F)N1,NC(=O)C1CCC(C2=CC=C(OCC3=CC=CC=C3F)C=C2F)N1
5115,CHEMBL4060355,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,800.00,6.096910,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,O=C(O)CC1CCC2(CCN(C3=CC=C(C4=NC5=CC(C(F)(F)F)=...,O=C(O)CC1CCC2(CCN(c3ccc(-c4nc5cc(C(F)(F)F)ccc5...,O=C(O)CC1CCC2(CCN(C3=CC=C(C4=NC5=CC(C(F)(F)F)=...
5116,CHEMBL3398278,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,40000.00,4.397940,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,CC(C)N1CC2=CC(C3=CC(C4=CC=C(Cl)N=C4)=NO3)=CC=C...,CC(C)N1Cc2cc(-c3cc(-c4ccc(Cl)nc4)no3)ccc2C1=O,CC(C)N1CC2=CC(C3=CC(C4=CC=C(Cl)N=C4)=NO3)=CC=C...
5118,CHEMBL4097019,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,26000.00,4.585027,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,CCOC1=CC(C(=O)NC2=CC(C(F)(F)F)=CC=N2)=CC=C1C1=...,CCOc1cc(C(=O)Nc2cc(C(F)(F)F)ccn2)ccc1-c1nc(C23...,CCOC1=CC(C(=O)NC2=CC(C(F)(F)F)=CC=N2)=CC=C1C1=...


In [None]:
df4.to_csv(f'{savepath}\\df4_test.csv', sep=',', header=True, index=False)

**DUPLICATE REMOVAL**

In [169]:
inchikey_list = [rd_inchi.MolToInchiKey(mol) for mol in molFromMolBlock]
df4['InChIKey'] = inchikey_list

In [165]:
#curation summary
columnName = ['Removed w/n activity value', 'Removed w/n relation value', 'Removed w/ relation != "="', 'invalid smiles removed', 'organometallics', 'mixtures']
valueName = [total_removed1, total_removed2, total_removed3, len(indexDropList_salts), len(indexDropList_org), len(indexDropList_mix)]
pd.DataFrame(valueName, columnName)

Unnamed: 0,0
Removed w/n activity value,276
Removed w/n relation value,0
"Removed w/ relation != ""=""",1309
invalid smiles removed,0
organometallics,0
mixtures,1


In [52]:
df4.to_csv(f'{savepath}\\df4_test.csv', sep=',', header=True, index=False)