In [45]:
#import libraries and dependencies

import pandas as pd
import math
import numpy as np



from rdkit import Chem
from chembl_structure_pipeline import standardizer
from rdkit.Chem.MolStandardize.metal import MetalDisconnector
import rdkit.Chem.MolStandardize.rdMolStandardize as rdMolStandardize
from rdkit.Chem import PandasTools

from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import inchi as rd_inchi

from molvs import standardize_smiles
from molvs import Standardizer
from rdkit.Chem import Draw

In [46]:
#choose a path to save
savepath = r'D:\python\jupyterscripts\curation\natura'

In [47]:
#utils
def metal_atomic_numbers(at):
    """ This function checks the atomic number of an atom """
    
    n = at.GetAtomicNum()
    return (n==13) or (n>=21 and n<=31) or (n>=39 and n<=50) or (n>=57 and n<=83) or (n>=89 and n<=115)

def is_metal(smile):
    """ This function checks if an atom is a metal based on its atomic number """
    mol = Chem.MolFromSmiles(smile)
    rwmol = Chem.RWMol(mol)
    rwmol.UpdatePropertyCache(strict=False)
    metal = [at.GetSymbol() for at in rwmol.GetAtoms() if metal_atomic_numbers(at)]
    return len(metal) == 1

def smiles_preparator(smiles):
    """ This function prepares smiles by removing odd signs """
    smiles = smiles.replace('@','')
    return str(smiles)

def salt_remover(mol):
    """ This function removes salts, see complete list of possible salts in https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt """
    remover = SaltRemover(defnData=None)
    stripped = remover.StripMol(mol, dontRemoveEverything=True)
    return Chem.MolToSmiles(stripped)
    

In [48]:
df3 = pd.read_csv(r'D:\python\jupyterscripts\curation\natura\data_natura.csv', encoding='latin-1')
df3

Unnamed: 0,CAS,SMILES,TA1535,TA1537 ou TA97a ou TA97,TA98,TA100,WP2 ou TA102,TA1535 (S9+),TA1537 ou TA97a ou TA97(S9+),TA98 (S9+),TA100 (S9+),WP2 ou TA102 (S9+),Ames global,Source,Link
0,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,,,,,Negativo,Benchmark,
1,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,Negativo,Negativo,Negativo,,Negativo,OCHEM,A4542
2,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,Benchmark,
3,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,ECVAM,
4,99-99-0,CC1=CC=C(C=C1)N(=O)=O,,,,,,,Negativo,Negativo,,,Negativo,OCHEM,A3752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32056,,OC[C@H]1O[C@@H](OC2=CC=CC3=C2C(=O)C2=C(C=C(C=C...,,,,,,,,,,Positivo,Negativo,OCHEM,A4542
32057,,OC1=C(Cl)C(Cl)=C(Cl)C(Cl)=C1Cl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32058,,OC1=CC=CC=C1C1=CC=CC=C1,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32059,,OP(O)(=O)CCCl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722


In [19]:
"""#prepare smiles for furthur transformations
smiles = [smiles_preparator(str(smile)) for smile in df3['SMILES']]

#df3 = df3.drop(columns=["SMILES"])
df3['SMILES_new'] = smiles"""

In [49]:
df3

Unnamed: 0,CAS,SMILES,TA1535,TA1537 ou TA97a ou TA97,TA98,TA100,WP2 ou TA102,TA1535 (S9+),TA1537 ou TA97a ou TA97(S9+),TA98 (S9+),TA100 (S9+),WP2 ou TA102 (S9+),Ames global,Source,Link
0,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,,,,,Negativo,Benchmark,
1,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,Negativo,Negativo,Negativo,,Negativo,OCHEM,A4542
2,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,Benchmark,
3,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,ECVAM,
4,99-99-0,CC1=CC=C(C=C1)N(=O)=O,,,,,,,Negativo,Negativo,,,Negativo,OCHEM,A3752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32056,,OC[C@H]1O[C@@H](OC2=CC=CC3=C2C(=O)C2=C(C=C(C=C...,,,,,,,,,,Positivo,Negativo,OCHEM,A4542
32057,,OC1=C(Cl)C(Cl)=C(Cl)C(Cl)=C1Cl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32058,,OC1=CC=CC=C1C1=CC=CC=C1,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32059,,OP(O)(=O)CCCl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722


**REMOVE SALTS, MIXTURES AND ORGANOMETALLICS**

In [51]:
#remove salts
wrongSmiles = []
new_smiles = []
indexDropList = []
for index, smile in enumerate(df3['SMILES']):
    try:
        mol = Chem.MolFromSmiles(smile)
        remov = salt_remover(mol)
        new_smiles.append(remov)
    except:
        wrongSmiles.append(df3.iloc[[index]])
        indexDropList.append(index)
    
df3 = df3.drop(df3.index[indexDropList])
df3

Unnamed: 0,CAS,SMILES,TA1535,TA1537 ou TA97a ou TA97,TA98,TA100,WP2 ou TA102,TA1535 (S9+),TA1537 ou TA97a ou TA97(S9+),TA98 (S9+),TA100 (S9+),WP2 ou TA102 (S9+),Ames global,Source,Link
0,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,,,,,Negativo,Benchmark,
1,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,Negativo,Negativo,Negativo,,Negativo,OCHEM,A4542
2,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,Benchmark,
3,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,ECVAM,
4,99-99-0,CC1=CC=C(C=C1)N(=O)=O,,,,,,,Negativo,Negativo,,,Negativo,OCHEM,A3752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32056,,OC[C@H]1O[C@@H](OC2=CC=CC3=C2C(=O)C2=C(C=C(C=C...,,,,,,,,,,Positivo,Negativo,OCHEM,A4542
32057,,OC1=C(Cl)C(Cl)=C(Cl)C(Cl)=C1Cl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32058,,OC1=CC=CC=C1C1=CC=CC=C1,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32059,,OP(O)(=O)CCCl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722


In [52]:
#remove organlometallics
organometals = []
indexDropList = []
for index, smile in enumerate(df3['SMILES']):
    if is_metal(smile) == True:
        organometals.append(df3.iloc[[index]])
        indexDropList.append(index)

#drop organometallics
df3 = df3.drop(df3.index[indexDropList])

#save droped organometallics
organmetal = pd.concat(organometals)
organmetal.to_csv(f'{savepath}\\organometallics.csv', sep=',', header=True, index=False)

print(f'Organometallics removed: {len(organmetal)}')
df3

Organometallics removed: 650


Unnamed: 0,CAS,SMILES,TA1535,TA1537 ou TA97a ou TA97,TA98,TA100,WP2 ou TA102,TA1535 (S9+),TA1537 ou TA97a ou TA97(S9+),TA98 (S9+),TA100 (S9+),WP2 ou TA102 (S9+),Ames global,Source,Link
0,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,,,,,Negativo,Benchmark,
1,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,Negativo,Negativo,Negativo,,Negativo,OCHEM,A4542
2,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,Benchmark,
3,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,ECVAM,
4,99-99-0,CC1=CC=C(C=C1)N(=O)=O,,,,,,,Negativo,Negativo,,,Negativo,OCHEM,A3752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32056,,OC[C@H]1O[C@@H](OC2=CC=CC3=C2C(=O)C2=C(C=C(C=C...,,,,,,,,,,Positivo,Negativo,OCHEM,A4542
32057,,OC1=C(Cl)C(Cl)=C(Cl)C(Cl)=C1Cl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32058,,OC1=CC=CC=C1C1=CC=CC=C1,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32059,,OP(O)(=O)CCCl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722


In [53]:
#remove mixtures

mixtureList = []
indexDropList = []
for index, smile in enumerate (df3['SMILES']):
    for char in smile:
        if char == '.':
            mixtureList.append(df3.iloc[[index]])
            indexDropList.append(index)
            break

            
if len(indexDropList) == 0:
    print("no mixtures found")
    
else:
    #drop mixtures
    df3 = df3.drop(df3.index[indexDropList])
    
    print(f"{len(indexDropList)} mixtures found")
    
    #save removes mixtures
    mixtures = pd.concat(mixtureList)
    mixtures.to_csv(f'{savepath}\\mixtures.csv', sep=',', header=True, index=False)
df3 

1522 mixtures found


Unnamed: 0,CAS,SMILES,TA1535,TA1537 ou TA97a ou TA97,TA98,TA100,WP2 ou TA102,TA1535 (S9+),TA1537 ou TA97a ou TA97(S9+),TA98 (S9+),TA100 (S9+),WP2 ou TA102 (S9+),Ames global,Source,Link
0,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,,,,,Negativo,Benchmark,
1,999-97-3,C[Si](C)(C)N[Si](C)(C)C,,,,,,,Negativo,Negativo,Negativo,,Negativo,OCHEM,A4542
2,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,Benchmark,
3,99-99-0,Cc1ccc(cc1)[N+](=O)[O-],,,,,,,,,,,Negativo,ECVAM,
4,99-99-0,CC1=CC=C(C=C1)N(=O)=O,,,,,,,Negativo,Negativo,,,Negativo,OCHEM,A3752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32056,,OC[C@H]1O[C@@H](OC2=CC=CC3=C2C(=O)C2=C(C=C(C=C...,,,,,,,,,,Positivo,Negativo,OCHEM,A4542
32057,,OC1=C(Cl)C(Cl)=C(Cl)C(Cl)=C1Cl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32058,,OC1=CC=CC=C1C1=CC=CC=C1,,,,,,,,,,Negativo,Negativo,OCHEM,A2722
32059,,OP(O)(=O)CCCl,,,,,,,,,,Negativo,Negativo,OCHEM,A2722


**Standardisation**

In [55]:
"""
    -Standardize unknown stereochemistry (Handled by the RDKit Mol file parser)
        Fix wiggly bonds on sp3 carbons - sets atoms and bonds marked as unknown stereo to no stereo
        Fix wiggly bonds on double bonds – set double bond to crossed bond
    -Clears S Group data from the mol file
    -Kekulize the structure
    -Remove H atoms (See the page on explicit Hs for more details)
    -Normalization:
        Fix hypervalent nitro groups
        Fix KO to K+ O- and NaO to Na+ O- (Also add Li+ to this)
        Correct amides with N=COH
        Standardise sulphoxides to charge separated form
        Standardize diazonium N (atom :2 here: [*:1]-[N;X2:2]#[N;X1:3]>>[*:1]) to N+
        Ensure quaternary N is charged
        Ensure trivalent O ([*:1]=[O;X2;v3;+0:2]-[#6:3]) is charged
        Ensure trivalent S ([O:1]=[S;D2;+0:2]-[#6:3]) is charged
        Ensure halogen with no neighbors ([F,Cl,Br,I;X0;+0:1]) is charged
    -The molecule is neutralized, if possible. See the page on neutralization rules for more details.
    -Remove stereo from tartrate to simplify salt matching
    -Normalise (straighten) triple bonds and allenes
"""

#standardise molecules
mol2stand = [standardizer.standardize_mol(mol) for mol in df3['rdMol']]

#convert molecules to smiles
standMol2smiles = [Chem.MolToSmiles(mol) for mol in mol2stand]

#drop old smiles and rdMol and update it 
df3 = df3.drop(columns=["SMILES"])
df3 = df3.drop(columns=["rdMol"]) 
df3['SMILES'] = standMol2smiles
df3['rdMol'] = mol2stand
df3

KeyError: 'rdMol'

**Standardisation troubleshooting**

In [None]:
"""FIRST TRY TO STANDARDISE"""
def standardize(smiles):
    mol = Chem.MolFromSmiles(smiles, sanitize = True)
     
    # removeHs, disconnect metal atoms, normalize the molecule, reionize the molecule
    clean_mol = rdMolStandardize.Cleanup(mol) 
     
    # if many fragments, get the "parent" (the actual mol we are interested in) 
    parent_clean_mol = rdMolStandardize.FragmentParent(clean_mol)
         
    # try to neutralize molecule
    uncharger = rdMolStandardize.Uncharger() # annoying, but necessary as no convenience method exists
    uncharged_parent_clean_mol = uncharger.uncharge(parent_clean_mol)
     
    # note that no attempt is made at reionization at this step
    # nor at ionization at some pH (rdkit has no pKa caculator)
    # the main aim to to represent all molecules from different sources
    # in a (single) standard way, for use in ML, catalogue, etc.
     
    te = rdMolStandardize.TautomerEnumerator() # idem
    taut_uncharged_parent_clean_mol = te.Canonicalize(uncharged_parent_clean_mol)
     
    return taut_uncharged_parent_clean_mol

In [None]:
a = standardize('O[C@]1C2=c3c(cc4cccc5ccc(c3c54)[C@@]1O)-c1ccccc12')
b = standardize('OC1C2=c3c(cc4cccc5ccc(c3c54)C1O)-c1ccccc21')
c = standardize('O[C]1C2=c3c(cc4cccc5ccc(c3c54)[C]1O)-c1ccccc12')

print(rd_inchi.MolToInchiKey(a))
print(rd_inchi.MolToInchiKey(b))
print(rd_inchi.MolToInchiKey(c))

In [None]:
"""SECOND TRY TO STANDARDISE"""

mol1 = Chem.MolFromSmiles('O[C@]1C2=c3c(cc4cccc5ccc(c3c54)[C@@]1O)-c1ccccc12')
mol_block1 = Chem.MolToMolBlock(mol1)

mol2 = Chem.MolFromSmiles('OC1C2=c3c(cc4cccc5ccc(c3c54)C1O)-c1ccccc21')
mol_block2 = Chem.MolToMolBlock(mol2)

mol3 = Chem.MolFromSmiles('O[C]1C2=c3c(cc4cccc5ccc(c3c54)[C]1O)-c1ccccc12')
mol_block3 = Chem.MolToMolBlock(mol3)

In [None]:
std_molblock1 = standardizer.standardize_molblock(mol_block1)
std_molblock2 = standardizer.standardize_molblock(mol_block2)
std_molblock3 = standardizer.standardize_molblock(mol_block3)

In [None]:
mol_final1 = Chem.MolFromMolBlock(std_molblock1)
mol_final2 = Chem.MolFromMolBlock(std_molblock2)
mol_final3 = Chem.MolFromMolBlock(std_molblock3)
Draw.MolsToGridImage([mol_final1, mol_final2, mol_final3])

In [None]:
print(rd_inchi.MolToInchiKey(mol_final1))
print(rd_inchi.MolToInchiKey(mol_final2))
print(rd_inchi.MolToInchiKey(mol_final3))

In [17]:
#save in csv
df3.to_csv(f'{savepath}\\standardized_molecules.csv', sep=',', header=True, index=False)