From f464d3ce713651197cf98f3a98c1879f85df2c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 5 Oct 2020 10:32:37 +0200 Subject: [PATCH 01/47] fix carbonyl --- package/MDAnalysis/coordinates/RDKit.py | 6 +++--- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 30175e7fbcf..f370c42e06e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -735,7 +735,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) - pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]") + pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0;!O]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) @@ -744,13 +744,13 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): return # check if there's an even number of anion-*=* patterns elif n_matches % 2 == 0: - end_pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]-[*-]") + end_pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0]-[*-]") else: # as a last resort, the only way to standardize is to find a nitrogen # that can accept a double bond and a positive charge # or a carbonyl that will become an enolate end_pattern = Chem.MolFromSmarts( - "[*-;!O]-[*+0]=[*+0]-[$([#7;X3;v3]),$([#6+0]=O)]") + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0]=O)]") backtrack = [] for _ in range(max_iter): # simplest case where n=1 diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 1fc1bea543d..41dc6da40db 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -490,6 +490,8 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "O=C([C@H](CC1=CNC=N1)N)O", "O=C([C@H](CC1=CN=CN1)N)O", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + # fix conjugated carbonyl/carboxyl + "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From d129537244204a83860711c385b374fd34804100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 12 Oct 2020 19:27:34 +0200 Subject: [PATCH 02/47] fix charged mols --- package/MDAnalysis/coordinates/RDKit.py | 40 +++++++++++-------- .../MDAnalysisTests/coordinates/test_rdkit.py | 18 +++++++-- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f370c42e06e..e625c0b2cc9 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -513,8 +513,10 @@ def _infer_bo_and_charges(mol): one will be charged. It will also affect more complex conjugated systems. """ - for atom in sorted(mol.GetAtoms(), reverse=True, - key=lambda a: _get_nb_unpaired_electrons(a)[0]): + atoms = sorted([a for a in mol.GetAtoms() if a.GetAtomicNum() > 1], + reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]) + for atom in atoms: # get NUE for each possible valence nue = _get_nb_unpaired_electrons(atom) # if there's only one possible valence state and the corresponding @@ -550,11 +552,11 @@ def _infer_bo_and_charges(mol): # recalculate nue for atom nue = _get_nb_unpaired_electrons(atom) - # if the atom still has unpaired electrons - nue = _get_nb_unpaired_electrons(atom)[0] - if nue > 0: - # transform it to a negative charge - atom.SetFormalCharge(-nue) + # if atom valence is still not filled + nue_list = _get_nb_unpaired_electrons(atom) + if not any(nue == 0 for nue in nue_list): + # transform nue to charge + atom.SetFormalCharge(-nue_list[0]) atom.SetNumRadicalElectrons(0) mol.UpdatePropertyCache(strict=False) @@ -611,19 +613,19 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | conjugated-O- | ``[O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | +---------------+------------------------------------------------------------------------------+ - | Cterm | ``[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]`` | + | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | +---------------+------------------------------------------------------------------------------+ | Nterm | ``[N-;X2;H1:1]>>[N+0:1]`` | +---------------+------------------------------------------------------------------------------+ | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | +---------------+------------------------------------------------------------------------------+ - | arginine | ``[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4]`` | + | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | +---------------+------------------------------------------------------------------------------+ | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| +---------------+------------------------------------------------------------------------------+ | sulfone | ``[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3]`` | +---------------+------------------------------------------------------------------------------+ - | nitro | ``[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3]`` | + | charged N | ``[#7+0;v3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | +---------------+------------------------------------------------------------------------------+ """ @@ -635,17 +637,15 @@ def _standardize_patterns(mol, max_iter=200): for reactant in Chem.GetMolFrags(mol, asMols=True): for name, reaction in [ - ("Cterm", "[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]"), + ("Cterm", "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]"), ("Nterm", "[N-;X2;H1:1]>>[N+0:1]"), ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), - ("ARG", "[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]" - ">>[N:1]-[C+0:2](-[N:3])=[N+:4]"), + ("ARG", "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]"), ("HIP", "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]"), ("sulfone", "[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]" ">>[S:1](=[O+0:2])=[O+0:3]"), - ("nitro", "[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]" - ">>[N+:1](-[O-:2])=[O+0:3]"), + ("charged-N", "[#7+0;v3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), ]: reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) @@ -789,7 +789,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # check if we haven't already transformed this triplet for match in matches: # sort the indices for the comparison - g = tuple(sorted(match)) + g = set(match) if g in backtrack: # already transformed continue @@ -799,7 +799,13 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): backtrack.append(g) break else: - anion, a1, a2 = matches[0] + # tried all paths and now blocked from backtracking + warnings.warn("The standardization got stuck in a dead-end, " + "likely because of a charge in a conjugated " + "system. Please check the output molecule " + "carefully as it is likely to be an unreasonable" + " resonance structure") + return # charges mol.GetAtomWithIdx(anion).SetFormalCharge(0) mol.GetAtomWithIdx(a2).SetFormalCharge(-1) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 41dc6da40db..017954390f9 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -490,8 +490,11 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "O=C([C@H](CC1=CNC=N1)N)O", "O=C([C@H](CC1=CN=CN1)N)O", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - # fix conjugated carbonyl/carboxyl + # fixes from PR #??? "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", + "[O-][n+]1cccnc1", + "C[n+]1ccccc1", + "[PH4+]", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders @@ -516,9 +519,16 @@ def test_order_independant(self, smi_in): m = _standardize_patterns(m) Chem.SanitizeMol(m) m = Chem.RemoveHs(m) - assert m.HasSubstructMatch(ref) and ref.HasSubstructMatch( - m), (f"(input) {Chem.MolToSmiles(ref)} != " - f"{Chem.MolToSmiles(m)} (output) root atom {a.GetIdx()}") + match = m.HasSubstructMatch(ref) and ref.HasSubstructMatch(m) + if not match: + # try resonance structures for charged conjugated systems + for mol in Chem.ResonanceMolSupplier(m, maxStructs=20): + match = mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol) + if match: + break + assert match, (f"(input) {Chem.MolToSmiles(ref)} != " + f"{Chem.MolToSmiles(m)} (output) " + f"root atom {a.GetIdx()}") def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" From 24ddb3ead17fc15b6d7d482f1c1aaf900ff9f077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 14 Oct 2020 17:59:03 +0200 Subject: [PATCH 03/47] remove false premature return --- package/MDAnalysis/coordinates/RDKit.py | 272 ++++++++++++------------ 1 file changed, 133 insertions(+), 139 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index e625c0b2cc9..81e24fcb6ec 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -67,6 +67,7 @@ import warnings import re import copy +from functools import lru_cache import numpy as np @@ -244,19 +245,13 @@ class RDKitConverter(base.ConverterBase): Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every frame would be too slow so the converter uses a caching system. The cache - only remembers the id of the last AtomGroup that was converted, as well - as the arguments that were passed to the converter. This means that using - ``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the - cache since the selection is deleted from memory as soon as the conversion - is finished. Instead, users should do this in two steps by first saving the - selection in a variable and then converting the saved AtomGroup. It also - means that ``ag.convert_to("RDKIT")`` followed by - ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache. - Finally if you're modifying the AtomGroup in place between two conversions, - the id of the AtomGroup won't change and thus the converter will use the - cached molecule. For this reason, you can pass a ``cache=False`` argument - to the converter to bypass the caching system. - Note that the cached molecule doesn't contain the coordinates of the atoms. + only remembers the 2 most recent AtomGroups that were converted, as well + as the arguments that were passed to the converter. The number of objects + cached can be changed with the function :func:`set_converter_cache_size`. + However, ``ag.convert_to("RDKIT")`` followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` + will not use the cache since the arguments given are different. + You can pass a ``cache=False`` argument to the converter to bypass the + caching system. .. versionadded:: 2.0.0 @@ -265,7 +260,6 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} - _cache = dict() def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): """Write selection at current trajectory frame to @@ -278,16 +272,14 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the - same AtomGroup object (same id) and with the same arguments passed - to the converter (with the exception of this `cache` argument) + same AtomGroup selection and with the same arguments passed + to the converter NoImplicit : bool Prevent adding hydrogens to the molecule max_iter : int Maximum number of iterations to standardize conjugated systems. See :func:`_rebuild_conjugated_bonds` """ - # parameters passed to atomgroup_to_mol and used by the cache - kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) try: from rdkit import Chem @@ -303,22 +295,13 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): "please use a valid AtomGroup or Universe".format( type(obj))) from None + # parameters passed to atomgroup_to_mol + kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) if cache: - # key used to search the cache - key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" - for key, value in kwargs.items()) - try: - mol = self._cache[key] - except KeyError: - # only keep the current molecule in cache - self._cache.clear() - # create the topology - self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) - # continue on copy of the cached molecule + mol = atomgroup_to_mol(ag, **kwargs) mol = copy.deepcopy(mol) else: - self._cache.clear() - mol = self.atomgroup_to_mol(ag, **kwargs) + mol = atomgroup_to_mol.__wrapped__(ag, **kwargs) # add a conformer for the current Timestep if hasattr(ag, "positions"): @@ -339,108 +322,124 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): return mol - def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200): - """Converts an AtomGroup to an RDKit molecule. - Parameters - ----------- - ag : MDAnalysis.core.groups.AtomGroup - The AtomGroup to convert - NoImplicit : bool - Prevent adding hydrogens to the molecule - max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` - """ - try: - elements = ag.elements - except NoDataError: - raise AttributeError( - "The `elements` attribute is required for the RDKitConverter " - "but is not present in this AtomGroup. Please refer to the " - "documentation to guess elements from other attributes or " - "type `help(mda.topology.guessers)`") from None - - if "H" not in ag.elements: - warnings.warn( - "No hydrogen atom could be found in the topology, but the " - "converter requires all hydrogens to be explicit. Please " - "check carefully the output molecule as the converter is " - "likely to add negative charges and assign incorrect bond " - "orders to structures with implicit hydrogens. Alternatively, " - "you can use the parameter `NoImplicit=False` when using the " - "converter to allow implicit hydrogens and disable inferring " - "bond orders and charges." - ) - - # attributes accepted in PDBResidueInfo object - pdb_attrs = {} - if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): - raise AttributeError( - "Both `tempfactors` and `bfactors` attributes are present but " - "only one can be assigned to the RDKit molecule. Please " - "delete the unnecessary one and retry." - ) - for attr in RDATTRIBUTES.keys(): - if hasattr(ag, attr): - pdb_attrs[attr] = getattr(ag, attr) - - other_attrs = {} - for attr in ["charges", "segids", "types"]: - if hasattr(ag, attr): - other_attrs[attr] = getattr(ag, attr) - - mol = Chem.RWMol() - # map index in universe to index in mol - atom_mapper = {} - - for i, (atom, element) in enumerate(zip(ag, elements)): - # create atom - rdatom = Chem.Atom(element.capitalize()) - # enable/disable adding implicit H to the molecule - rdatom.SetNoImplicit(NoImplicit) - # add PDB-like properties - mi = Chem.AtomPDBResidueInfo() - for attr, values in pdb_attrs.items(): - _add_mda_attr_to_rdkit(attr, values[i], mi) - rdatom.SetMonomerInfo(mi) - # other properties - for attr in other_attrs.keys(): - value = other_attrs[attr][i] - attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular - _set_atom_property(rdatom, attr, value) - _set_atom_property(rdatom, "_MDAnalysis_index", i) - # add atom - index = mol.AddAtom(rdatom) - atom_mapper[atom.ix] = index +@lru_cache(maxsize=2) +def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200): + """Converts an AtomGroup to an RDKit molecule without coordinates. + Parameters + ----------- + ag : MDAnalysis.core.groups.AtomGroup + The AtomGroup to convert + NoImplicit : bool + Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` + """ + try: + elements = ag.elements + except NoDataError: + raise AttributeError( + "The `elements` attribute is required for the RDKitConverter " + "but is not present in this AtomGroup. Please refer to the " + "documentation to guess elements from other attributes or " + "type `help(mda.topology.guessers)`") from None + + if "H" not in ag.elements: + warnings.warn( + "No hydrogen atom could be found in the topology, but the " + "converter requires all hydrogens to be explicit. Please " + "check carefully the output molecule as the converter is " + "likely to add negative charges and assign incorrect bond " + "orders to structures with implicit hydrogens. Alternatively, " + "you can use the parameter `NoImplicit=False` when using the " + "converter to allow implicit hydrogens and disable inferring " + "bond orders and charges." + ) + + # attributes accepted in PDBResidueInfo object + pdb_attrs = {} + if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): + raise AttributeError( + "Both `tempfactors` and `bfactors` attributes are present but " + "only one can be assigned to the RDKit molecule. Please " + "delete the unnecessary one and retry." + ) + for attr in RDATTRIBUTES.keys(): + if hasattr(ag, attr): + pdb_attrs[attr] = getattr(ag, attr) + + other_attrs = {} + for attr in ["charges", "segids", "types"]: + if hasattr(ag, attr): + other_attrs[attr] = getattr(ag, attr) + + mol = Chem.RWMol() + # map index in universe to index in mol + atom_mapper = {} + + for i, (atom, element) in enumerate(zip(ag, elements)): + # create atom + rdatom = Chem.Atom(element.capitalize()) + # enable/disable adding implicit H to the molecule + rdatom.SetNoImplicit(NoImplicit) + # add PDB-like properties + mi = Chem.AtomPDBResidueInfo() + for attr, values in pdb_attrs.items(): + _add_mda_attr_to_rdkit(attr, values[i], mi) + rdatom.SetMonomerInfo(mi) + # other properties + for attr in other_attrs.keys(): + value = other_attrs[attr][i] + attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular + _set_atom_property(rdatom, attr, value) + _set_atom_property(rdatom, "_MDAnalysis_index", i) + # add atom + index = mol.AddAtom(rdatom) + atom_mapper[atom.ix] = index + + try: + ag.bonds + except NoDataError: + warnings.warn( + "No `bonds` attribute in this AtomGroup. Guessing bonds based " + "on atoms coordinates") + ag.guess_bonds() + + for bond in ag.bonds: try: - ag.bonds - except NoDataError: - warnings.warn( - "No `bonds` attribute in this AtomGroup. Guessing bonds based " - "on atoms coordinates") - ag.guess_bonds() - - for bond in ag.bonds: - try: - bond_indices = [atom_mapper[i] for i in bond.indices] - except KeyError: - continue - bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) - mol.AddBond(*bond_indices, bond_type) - - mol.UpdatePropertyCache(strict=False) - - if NoImplicit: - # infer bond orders and formal charges from the connectivity - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol, max_iter) - - # sanitize - Chem.SanitizeMol(mol) + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + continue + bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) + mol.AddBond(*bond_indices, bond_type) + + mol.UpdatePropertyCache(strict=False) + + if NoImplicit: + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol, max_iter) + + # sanitize + Chem.SanitizeMol(mol) + + return mol - return mol + +def set_converter_cache_size(maxsize): + """Set the maximum cache size of the RDKit converter + + Parameters + ---------- + maxsize : int or None + If int, the cache will only keep the ``maxsize`` most recent + conversions in memory. Using ``maxsize=None`` will remove all limits + to the cache size, i.e. everything is cached. + """ + global atomgroup_to_mol + atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__) def _add_mda_attr_to_rdkit(attr, value, mi): @@ -753,7 +752,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0]=O)]") backtrack = [] for _ in range(max_iter): - # simplest case where n=1 + # check for ending pattern end_match = mol.GetSubstructMatch(end_pattern) if end_match: # index of each atom @@ -766,6 +765,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): if neighbor.GetAtomicNum() == 8 and bond.GetBondTypeAsDouble() == 2: bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) + break else: # [*-]-*=*-N if term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 0: @@ -773,7 +773,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # [*-]-*=*-[*-] else: end_charge = 0 - mol.GetAtomWithIdx(anion2).SetFormalCharge(end_charge) + term_atom.SetFormalCharge(end_charge) # common part of the conjugated systems: [*-]-*=* mol.GetAtomWithIdx(anion1).SetFormalCharge(0) mol.GetBondBetweenAtoms(anion1, a1).SetBondType( @@ -784,6 +784,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): mol.UpdatePropertyCache(strict=False) # shorten the anion-anion pattern from n to n-1 + # will not match if ending pattern has been reached matches = mol.GetSubstructMatches(pattern) if matches: # check if we haven't already transformed this triplet @@ -798,14 +799,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): anion, a1, a2 = match backtrack.append(g) break - else: - # tried all paths and now blocked from backtracking - warnings.warn("The standardization got stuck in a dead-end, " - "likely because of a charge in a conjugated " - "system. Please check the output molecule " - "carefully as it is likely to be an unreasonable" - " resonance structure") - return + # charges mol.GetAtomWithIdx(anion).SetFormalCharge(0) mol.GetAtomWithIdx(a2).SetFormalCharge(-1) From 4b6afc10fdbce1219f7a097a7b6daf092048264a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 14 Oct 2020 17:59:36 +0200 Subject: [PATCH 04/47] minor comments --- package/MDAnalysis/coordinates/RDKit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 81e24fcb6ec..7f749726750 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -758,7 +758,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # index of each atom anion1, a1, a2, anion2 = end_match term_atom = mol.GetAtomWithIdx(anion2) - # [*-]-*=*-C=O + # [*-]-*=*-C=O --> *=*-*=C-[O-] if term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0: for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) @@ -767,10 +767,10 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): neighbor.SetFormalCharge(-1) break else: - # [*-]-*=*-N + # [*-]-*=*-N --> *=*-*=[N+] if term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 0: end_charge = 1 - # [*-]-*=*-[*-] + # [*-]-*=*-[*-] --> *=*-*=* else: end_charge = 0 term_atom.SetFormalCharge(end_charge) From c9513c7dbed2216a4c558d63dfe4ca5327f90c75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 14 Oct 2020 18:18:45 +0200 Subject: [PATCH 05/47] assign most likely charges to monatomic ions --- package/MDAnalysis/coordinates/RDKit.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 7f749726750..f0856d85784 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -511,11 +511,25 @@ def _infer_bo_and_charges(mol): R-C(-O)-O the first oxygen read will receive a double bond and the other one will be charged. It will also affect more complex conjugated systems. """ - atoms = sorted([a for a in mol.GetAtoms() if a.GetAtomicNum() > 1], reverse=True, key=lambda a: _get_nb_unpaired_electrons(a)[0]) + + MONOATOMIC_ION_CHARGES = { + 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, + 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, + 26: 2, # Fe could also be 3 + 13: 3, + 9: -1, 17: -1, 35: -1, 53: -1, + } + for atom in atoms: + # monatomic ions + if atom.GetDegree() == 0: + atom.SetFormalCharge(MONOATOMIC_ION_CHARGES.get( + atom.GetAtomicNum(), 0)) + mol.UpdatePropertyCache(strict=False) + continue # get NUE for each possible valence nue = _get_nb_unpaired_electrons(atom) # if there's only one possible valence state and the corresponding From 338dad3fba447739f71edabb97dea94f144844f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 14 Oct 2020 18:46:17 +0200 Subject: [PATCH 06/47] fix when 2 ending patterns are matched --- package/MDAnalysis/coordinates/RDKit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f0856d85784..da42c3b8cf4 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -796,9 +796,9 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): mol.GetBondBetweenAtoms(a2, anion2).SetBondType( Chem.BondType.DOUBLE) mol.UpdatePropertyCache(strict=False) + continue # shorten the anion-anion pattern from n to n-1 - # will not match if ending pattern has been reached matches = mol.GetSubstructMatches(pattern) if matches: # check if we haven't already transformed this triplet From 029ab1f1d68eb7d85ff18db651adf5412661b435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 15 Oct 2020 15:41:50 +0200 Subject: [PATCH 07/47] skip when backtrack prevents changes --- package/MDAnalysis/coordinates/RDKit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index da42c3b8cf4..dbf1afae268 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -813,6 +813,9 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): anion, a1, a2 = match backtrack.append(g) break + else: + # already performed all changes + continue # charges mol.GetAtomWithIdx(anion).SetFormalCharge(0) From 046383ecfd155b2c20d4f03455e8c823d7bab9be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 27 Oct 2020 19:25:45 +0100 Subject: [PATCH 08/47] improvements for Nterm, sulfones, and nitrogen next to a charged atom --- package/MDAnalysis/coordinates/RDKit.py | 64 +++++++++---------- .../MDAnalysisTests/coordinates/test_rdkit.py | 10 +++ 2 files changed, 41 insertions(+), 33 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index dbf1afae268..c93e98ba696 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -628,7 +628,7 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | +---------------+------------------------------------------------------------------------------+ - | Nterm | ``[N-;X2;H1:1]>>[N+0:1]`` | + | Nterm | ``[N-;H1;$(N-[*^3]):1]>>[N+0:1]`` | +---------------+------------------------------------------------------------------------------+ | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | +---------------+------------------------------------------------------------------------------+ @@ -636,9 +636,9 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| +---------------+------------------------------------------------------------------------------+ - | sulfone | ``[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3]`` | + | sulfone | ``[S;X4;v4:1](-[*-:2])-[*-:3]>>[S:1](=[*+0:2])=[*+0:3]`` | +---------------+------------------------------------------------------------------------------+ - | charged N | ``[#7+0;v3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | + | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | +---------------+------------------------------------------------------------------------------+ """ @@ -646,24 +646,26 @@ def _standardize_patterns(mol, max_iter=200): # standardize conjugated systems _rebuild_conjugated_bonds(mol, max_iter) + # list of sanitized reactions + reactions = [] + for name, reaction in [ + ("Cterm", "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]"), + ("Nterm", "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]"), + ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), + ("ARG", "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]"), + ("HIP", "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" + ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]"), + ("sulfone", "[S;X4;v4:1](-[*-:2])-[*-:3]" + ">>[S:1](=[*+0:2])=[*+0:3]"), + ("charged-N", "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), + ]: + rxn = AllChem.ReactionFromSmarts(reaction) + reactions.append(rxn) + fragments = [] for reactant in Chem.GetMolFrags(mol, asMols=True): - - for name, reaction in [ - ("Cterm", "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]"), - ("Nterm", "[N-;X2;H1:1]>>[N+0:1]"), - ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), - ("ARG", "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]"), - ("HIP", "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" - ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]"), - ("sulfone", "[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]" - ">>[S:1](=[O+0:2])=[O+0:3]"), - ("charged-N", "[#7+0;v3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), - ]: - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) + for reaction in reactions: reactant = _run_reaction(reaction, reactant) - fragments.append(reactant) # recombine fragments @@ -683,8 +685,8 @@ def _run_reaction(reaction, reactant): Parameters ---------- - reaction : str - SMARTS reaction + reaction : rdkit.Chem.rdChemReactions.ChemicalReaction + Reaction from SMARTS reactant : rdkit.Chem.rdchem.Mol The molecule to transform @@ -693,27 +695,22 @@ def _run_reaction(reaction, reactant): product : rdkit.Chem.rdchem.Mol The final product of the reaction """ - # count how many times the reaction should be run - pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) - n_matches = len(reactant.GetSubstructMatches(pattern)) - - # run the reaction for each matched pattern - rxn = AllChem.ReactionFromSmarts(reaction) - for n in range(n_matches): - products = rxn.RunReactants((reactant,)) + for n in range(reactant.GetNumAtoms()): + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + products = reaction.RunReactants((reactant,)) # only keep the first product if products: product = products[0][0] # map back atom properties from the reactant to the product _reassign_props_after_reaction(reactant, product) # apply the next reaction to the product - product.UpdatePropertyCache(strict=False) reactant = product else: - # exit the n_matches loop if there's no product. Example - # where this is needed: SO^{4}_{2-} will match the sulfone - # pattern 6 times but the reaction is only needed once + # exit the loop if there's no product break + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) return reactant @@ -815,7 +812,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): break else: # already performed all changes - continue + anion, a1, a2 = matches[0] # charges mol.GetAtomWithIdx(anion).SetFormalCharge(0) @@ -829,6 +826,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): continue # no more changes to apply + mol.UpdatePropertyCache(strict=False) return # reached max_iter diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 017954390f9..a88d910a399 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -495,6 +495,14 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "[O-][n+]1cccnc1", "C[n+]1ccccc1", "[PH4+]", + "c1nc[nH]n1", + "CC(=O)C=C(C)N", + "CC(C)=CC=C[O-]", + "O=S(C)(C)=NC", + "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", + # still needs fixing + # "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", + # "C(C#CCCCCCCCC(=O)O)#CC=CCCCC", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders @@ -506,6 +514,7 @@ def test_order_independant(self, smi_in): for bond in template.GetBonds(): bond.SetIsAromatic(False) bond.SetBondType(Chem.BondType.SINGLE) + template.UpdatePropertyCache(strict=False) # go through each possible starting atom for a in template.GetAtoms(): @@ -513,6 +522,7 @@ def test_order_independant(self, smi_in): m = Chem.MolFromSmiles(smi, sanitize=False) for atom in m.GetAtoms(): atom.SetFormalCharge(0) + atom.SetIsAromatic(False) atom.SetNoImplicit(True) m.UpdatePropertyCache(strict=False) _infer_bo_and_charges(m) From bd12bc1d0f5de7739ded8ce8a65231251a91e50c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 28 Oct 2020 19:56:35 +0100 Subject: [PATCH 09/47] better handling of ring conjugated systems and conjugated triple bonds --- package/MDAnalysis/coordinates/RDKit.py | 71 ++++++++++--------- .../MDAnalysisTests/coordinates/test_rdkit.py | 5 +- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index c93e98ba696..15441883c32 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -515,7 +515,7 @@ def _infer_bo_and_charges(mol): reverse=True, key=lambda a: _get_nb_unpaired_electrons(a)[0]) - MONOATOMIC_ION_CHARGES = { + MONATOMIC_ION_CHARGES = { 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, 26: 2, # Fe could also be 3 @@ -526,8 +526,9 @@ def _infer_bo_and_charges(mol): for atom in atoms: # monatomic ions if atom.GetDegree() == 0: - atom.SetFormalCharge(MONOATOMIC_ION_CHARGES.get( - atom.GetAtomicNum(), 0)) + atom.SetFormalCharge(MONATOMIC_ION_CHARGES.get( + atom.GetAtomicNum(), + _get_nb_unpaired_electrons(atom)[0])) mol.UpdatePropertyCache(strict=False) continue # get NUE for each possible valence @@ -545,7 +546,7 @@ def _infer_bo_and_charges(mol): neighbors = sorted(atom.GetNeighbors(), reverse=True, key=lambda a: _get_nb_unpaired_electrons(a)[0]) # check if one of the neighbors has a common NUE - for i, na in enumerate(neighbors, start=1): + for na in neighbors: # get NUE for the neighbor na_nue = _get_nb_unpaired_electrons(na) # smallest common NUE @@ -561,15 +562,16 @@ def _infer_bo_and_charges(mol): order = common_nue + 1 bond.SetBondType(RDBONDORDER[order]) mol.UpdatePropertyCache(strict=False) - if i < len(neighbors): - # recalculate nue for atom - nue = _get_nb_unpaired_electrons(atom) + # go to next atom if one of the valences is complete + nue = _get_nb_unpaired_electrons(atom) + if any([n == 0 for n in nue]): + break # if atom valence is still not filled - nue_list = _get_nb_unpaired_electrons(atom) - if not any(nue == 0 for nue in nue_list): + nue = _get_nb_unpaired_electrons(atom) + if not any([n == 0 for n in nue]): # transform nue to charge - atom.SetFormalCharge(-nue_list[0]) + atom.SetFormalCharge(-nue[0]) atom.SetNumRadicalElectrons(0) mol.UpdatePropertyCache(strict=False) @@ -620,7 +622,7 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | Name | Reaction | +===============+==============================================================================+ - | conjugated | ``[*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | +---------------+------------------------------------------------------------------------------+ | conjugated-N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | +---------------+------------------------------------------------------------------------------+ @@ -648,7 +650,7 @@ def _standardize_patterns(mol, max_iter=200): # list of sanitized reactions reactions = [] - for name, reaction in [ + for name, rxn in [ ("Cterm", "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]"), ("Nterm", "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]"), ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), @@ -659,8 +661,8 @@ def _standardize_patterns(mol, max_iter=200): ">>[S:1](=[*+0:2])=[*+0:3]"), ("charged-N", "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), ]: - rxn = AllChem.ReactionFromSmarts(reaction) - reactions.append(rxn) + reaction = AllChem.ReactionFromSmarts(rxn) + reactions.append(reaction) fragments = [] for reactant in Chem.GetMolFrags(mol, asMols=True): @@ -733,6 +735,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can control that we don't transform the same triplet of atoms back and forth by adding their indices to a list. + This function also handles conjugated systems with triple bonds. The molecule needs to be kekulized first to also cover systems with aromatic rings. @@ -745,7 +748,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) - pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0;!O]") + pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0;!O]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) @@ -754,7 +757,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): return # check if there's an even number of anion-*=* patterns elif n_matches % 2 == 0: - end_pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0]-[*-]") + end_pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") else: # as a last resort, the only way to standardize is to find a nitrogen # that can accept a double bond and a positive charge @@ -778,20 +781,16 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): neighbor.SetFormalCharge(-1) break else: - # [*-]-*=*-N --> *=*-*=[N+] - if term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 0: - end_charge = 1 - # [*-]-*=*-[*-] --> *=*-*=* - else: - end_charge = 0 - term_atom.SetFormalCharge(end_charge) + term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) # common part of the conjugated systems: [*-]-*=* - mol.GetAtomWithIdx(anion1).SetFormalCharge(0) - mol.GetBondBetweenAtoms(anion1, a1).SetBondType( - Chem.BondType.DOUBLE) - mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) - mol.GetBondBetweenAtoms(a2, anion2).SetBondType( - Chem.BondType.DOUBLE) + a = mol.GetAtomWithIdx(anion1) + a.SetFormalCharge(a.GetFormalCharge() + 1) + b = mol.GetBondBetweenAtoms(anion1, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + b = mol.GetBondBetweenAtoms(a2, anion2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) mol.UpdatePropertyCache(strict=False) continue @@ -813,14 +812,18 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): else: # already performed all changes anion, a1, a2 = matches[0] + backtrack = [] # charges - mol.GetAtomWithIdx(anion).SetFormalCharge(0) - mol.GetAtomWithIdx(a2).SetFormalCharge(-1) + a = mol.GetAtomWithIdx(anion) + a.SetFormalCharge(a.GetFormalCharge() + 1) + a = mol.GetAtomWithIdx(a2) + a.SetFormalCharge(a.GetFormalCharge() - 1) # bonds - mol.GetBondBetweenAtoms(anion, a1).SetBondType( - Chem.BondType.DOUBLE) - mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) + b = mol.GetBondBetweenAtoms(anion, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) mol.UpdatePropertyCache(strict=False) # start new iteration continue diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index a88d910a399..647f51ca632 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -500,9 +500,8 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "CC(C)=CC=C[O-]", "O=S(C)(C)=NC", "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", - # still needs fixing - # "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", - # "C(C#CCCCCCCCC(=O)O)#CC=CCCCC", + "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", + "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From bd14836837eba579b996e504b044f1390c578b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 28 Oct 2020 20:04:59 +0100 Subject: [PATCH 10/47] revert rdkitcache branch changes --- package/MDAnalysis/coordinates/RDKit.py | 249 ++++++++++++------------ 1 file changed, 126 insertions(+), 123 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 15441883c32..bf104fda9c9 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -245,13 +245,19 @@ class RDKitConverter(base.ConverterBase): Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every frame would be too slow so the converter uses a caching system. The cache - only remembers the 2 most recent AtomGroups that were converted, as well - as the arguments that were passed to the converter. The number of objects - cached can be changed with the function :func:`set_converter_cache_size`. - However, ``ag.convert_to("RDKIT")`` followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` - will not use the cache since the arguments given are different. - You can pass a ``cache=False`` argument to the converter to bypass the - caching system. + only remembers the id of the last AtomGroup that was converted, as well + as the arguments that were passed to the converter. This means that using + ``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the + cache since the selection is deleted from memory as soon as the conversion + is finished. Instead, users should do this in two steps by first saving the + selection in a variable and then converting the saved AtomGroup. It also + means that ``ag.convert_to("RDKIT")`` followed by + ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache. + Finally if you're modifying the AtomGroup in place between two conversions, + the id of the AtomGroup won't change and thus the converter will use the + cached molecule. For this reason, you can pass a ``cache=False`` argument + to the converter to bypass the caching system. + Note that the cached molecule doesn't contain the coordinates of the atoms. .. versionadded:: 2.0.0 @@ -260,6 +266,7 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} + _cache = dict() def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): """Write selection at current trajectory frame to @@ -272,14 +279,16 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the - same AtomGroup selection and with the same arguments passed - to the converter + same AtomGroup object (same id) and with the same arguments passed + to the converter (with the exception of this `cache` argument) NoImplicit : bool Prevent adding hydrogens to the molecule max_iter : int Maximum number of iterations to standardize conjugated systems. See :func:`_rebuild_conjugated_bonds` """ + # parameters passed to atomgroup_to_mol and used by the cache + kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) try: from rdkit import Chem @@ -295,13 +304,22 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): "please use a valid AtomGroup or Universe".format( type(obj))) from None - # parameters passed to atomgroup_to_mol - kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) if cache: - mol = atomgroup_to_mol(ag, **kwargs) + # key used to search the cache + key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" + for key, value in kwargs.items()) + try: + mol = self._cache[key] + except KeyError: + # only keep the current molecule in cache + self._cache.clear() + # create the topology + self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) + # continue on copy of the cached molecule mol = copy.deepcopy(mol) else: - mol = atomgroup_to_mol.__wrapped__(ag, **kwargs) + self._cache.clear() + mol = self.atomgroup_to_mol(ag, **kwargs) # add a conformer for the current Timestep if hasattr(ag, "positions"): @@ -323,123 +341,108 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): return mol -@lru_cache(maxsize=2) -def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200): - """Converts an AtomGroup to an RDKit molecule without coordinates. + def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200): + """Converts an AtomGroup to an RDKit molecule without coordinates. - Parameters - ----------- - ag : MDAnalysis.core.groups.AtomGroup - The AtomGroup to convert - NoImplicit : bool - Prevent adding hydrogens to the molecule - max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` - """ - try: - elements = ag.elements - except NoDataError: - raise AttributeError( - "The `elements` attribute is required for the RDKitConverter " - "but is not present in this AtomGroup. Please refer to the " - "documentation to guess elements from other attributes or " - "type `help(mda.topology.guessers)`") from None - - if "H" not in ag.elements: - warnings.warn( - "No hydrogen atom could be found in the topology, but the " - "converter requires all hydrogens to be explicit. Please " - "check carefully the output molecule as the converter is " - "likely to add negative charges and assign incorrect bond " - "orders to structures with implicit hydrogens. Alternatively, " - "you can use the parameter `NoImplicit=False` when using the " - "converter to allow implicit hydrogens and disable inferring " - "bond orders and charges." - ) - - # attributes accepted in PDBResidueInfo object - pdb_attrs = {} - if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): - raise AttributeError( - "Both `tempfactors` and `bfactors` attributes are present but " - "only one can be assigned to the RDKit molecule. Please " - "delete the unnecessary one and retry." - ) - for attr in RDATTRIBUTES.keys(): - if hasattr(ag, attr): - pdb_attrs[attr] = getattr(ag, attr) - - other_attrs = {} - for attr in ["charges", "segids", "types"]: - if hasattr(ag, attr): - other_attrs[attr] = getattr(ag, attr) - - mol = Chem.RWMol() - # map index in universe to index in mol - atom_mapper = {} - - for i, (atom, element) in enumerate(zip(ag, elements)): - # create atom - rdatom = Chem.Atom(element.capitalize()) - # enable/disable adding implicit H to the molecule - rdatom.SetNoImplicit(NoImplicit) - # add PDB-like properties - mi = Chem.AtomPDBResidueInfo() - for attr, values in pdb_attrs.items(): - _add_mda_attr_to_rdkit(attr, values[i], mi) - rdatom.SetMonomerInfo(mi) - # other properties - for attr in other_attrs.keys(): - value = other_attrs[attr][i] - attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular - _set_atom_property(rdatom, attr, value) - _set_atom_property(rdatom, "_MDAnalysis_index", i) - # add atom - index = mol.AddAtom(rdatom) - atom_mapper[atom.ix] = index - - try: - ag.bonds - except NoDataError: - warnings.warn( - "No `bonds` attribute in this AtomGroup. Guessing bonds based " - "on atoms coordinates") - ag.guess_bonds() - - for bond in ag.bonds: + Parameters + ----------- + ag : MDAnalysis.core.groups.AtomGroup + The AtomGroup to convert + NoImplicit : bool + Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` + """ try: - bond_indices = [atom_mapper[i] for i in bond.indices] - except KeyError: - continue - bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) - mol.AddBond(*bond_indices, bond_type) - - mol.UpdatePropertyCache(strict=False) - - if NoImplicit: - # infer bond orders and formal charges from the connectivity - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol, max_iter) + elements = ag.elements + except NoDataError: + raise AttributeError( + "The `elements` attribute is required for the RDKitConverter " + "but is not present in this AtomGroup. Please refer to the " + "documentation to guess elements from other attributes or " + "type `help(mda.topology.guessers)`") from None + + if "H" not in ag.elements: + warnings.warn( + "No hydrogen atom could be found in the topology, but the " + "converter requires all hydrogens to be explicit. Please " + "check carefully the output molecule as the converter is " + "likely to add negative charges and assign incorrect bond " + "orders to structures with implicit hydrogens. Alternatively, " + "you can use the parameter `NoImplicit=False` when using the " + "converter to allow implicit hydrogens and disable inferring " + "bond orders and charges." + ) + + # attributes accepted in PDBResidueInfo object + pdb_attrs = {} + if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): + raise AttributeError( + "Both `tempfactors` and `bfactors` attributes are present but " + "only one can be assigned to the RDKit molecule. Please " + "delete the unnecessary one and retry." + ) + for attr in RDATTRIBUTES.keys(): + if hasattr(ag, attr): + pdb_attrs[attr] = getattr(ag, attr) + + other_attrs = {} + for attr in ["charges", "segids", "types"]: + if hasattr(ag, attr): + other_attrs[attr] = getattr(ag, attr) + + mol = Chem.RWMol() + # map index in universe to index in mol + atom_mapper = {} + + for i, (atom, element) in enumerate(zip(ag, elements)): + # create atom + rdatom = Chem.Atom(element.capitalize()) + # enable/disable adding implicit H to the molecule + rdatom.SetNoImplicit(NoImplicit) + # add PDB-like properties + mi = Chem.AtomPDBResidueInfo() + for attr, values in pdb_attrs.items(): + _add_mda_attr_to_rdkit(attr, values[i], mi) + rdatom.SetMonomerInfo(mi) + # other properties + for attr in other_attrs.keys(): + value = other_attrs[attr][i] + attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular + _set_atom_property(rdatom, attr, value) + _set_atom_property(rdatom, "_MDAnalysis_index", i) + # add atom + index = mol.AddAtom(rdatom) + atom_mapper[atom.ix] = index - # sanitize - Chem.SanitizeMol(mol) + try: + ag.bonds + except NoDataError: + warnings.warn( + "No `bonds` attribute in this AtomGroup. Guessing bonds based " + "on atoms coordinates") + ag.guess_bonds() + + for bond in ag.bonds: + try: + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + continue + bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) + mol.AddBond(*bond_indices, bond_type) - return mol + mol.UpdatePropertyCache(strict=False) + if NoImplicit: + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol, max_iter) -def set_converter_cache_size(maxsize): - """Set the maximum cache size of the RDKit converter + # sanitize + Chem.SanitizeMol(mol) - Parameters - ---------- - maxsize : int or None - If int, the cache will only keep the ``maxsize`` most recent - conversions in memory. Using ``maxsize=None`` will remove all limits - to the cache size, i.e. everything is cached. - """ - global atomgroup_to_mol - atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__) + return mol def _add_mda_attr_to_rdkit(attr, value, mi): From 0e7d4e486a950c0a45a90b1539c67be0f22193d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 13 Nov 2020 02:51:14 +0100 Subject: [PATCH 11/47] monatomic default charges handling --- package/MDAnalysis/coordinates/RDKit.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index bf104fda9c9..8d0b93e1e30 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -518,20 +518,19 @@ def _infer_bo_and_charges(mol): reverse=True, key=lambda a: _get_nb_unpaired_electrons(a)[0]) - MONATOMIC_ION_CHARGES = { + MONATOMIC_CATION_CHARGES = { 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, - 26: 2, # Fe could also be 3 + 26: 2, # Fe could also be +3 13: 3, - 9: -1, 17: -1, 35: -1, 53: -1, } for atom in atoms: # monatomic ions if atom.GetDegree() == 0: - atom.SetFormalCharge(MONATOMIC_ION_CHARGES.get( + atom.SetFormalCharge(MONATOMIC_CATION_CHARGES.get( atom.GetAtomicNum(), - _get_nb_unpaired_electrons(atom)[0])) + -_get_nb_unpaired_electrons(atom)[0])) mol.UpdatePropertyCache(strict=False) continue # get NUE for each possible valence From d9b4d95024353eb7877755921d6258e40c904fa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 17 Nov 2020 19:43:37 +0100 Subject: [PATCH 12/47] fix multiple types of end pattern + test --- package/MDAnalysis/coordinates/RDKit.py | 26 +++++++++++++------ .../MDAnalysisTests/coordinates/test_rdkit.py | 1 + 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 8d0b93e1e30..7fc2ddbf522 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -751,21 +751,25 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0;!O]") + base_end_pattern = Chem.MolFromSmarts( + "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + odd_end_pattern = Chem.MolFromSmarts( + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0]=O)]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) if n_matches == 0: # nothing to standardize return - # check if there's an even number of anion-*=* patterns - elif n_matches % 2 == 0: - end_pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") - else: + elif n_matches == 1: # as a last resort, the only way to standardize is to find a nitrogen # that can accept a double bond and a positive charge # or a carbonyl that will become an enolate - end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0]=O)]") + end_pattern = odd_end_pattern + else: + # give priority to base end pattern and then deal with the odd one if + # necessary + end_pattern = base_end_pattern backtrack = [] for _ in range(max_iter): # check for ending pattern @@ -775,10 +779,12 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): anion1, a1, a2, anion2 = end_match term_atom = mol.GetAtomWithIdx(anion2) # [*-]-*=*-C=O --> *=*-*=C-[O-] - if term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0: + if (term_atom.GetAtomicNum() == 6 and + term_atom.GetFormalCharge() == 0): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if neighbor.GetAtomicNum() == 8 and bond.GetBondTypeAsDouble() == 2: + if (neighbor.GetAtomicNum() == 8 and + bond.GetBondTypeAsDouble() == 2): bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) break @@ -827,6 +833,10 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): b = mol.GetBondBetweenAtoms(a1, a2) b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) mol.UpdatePropertyCache(strict=False) + # update number of matches for the end pattern + n_matches = len(set([match[0] for match in matches])) + if n_matches == 1: + end_pattern = odd_end_pattern # start new iteration continue diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 647f51ca632..7c5d1e95dc6 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -502,6 +502,7 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", + "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1" ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From b36eab62a1ffb697bc56b0a585edf9376e9e40d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 18 Nov 2020 15:51:02 +0100 Subject: [PATCH 13/47] fix some sulfur and nitrogen compounds --- package/MDAnalysis/coordinates/RDKit.py | 32 ++++++++++++++----- .../MDAnalysisTests/coordinates/test_rdkit.py | 5 ++- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 7fc2ddbf522..99c97cb1349 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -659,8 +659,7 @@ def _standardize_patterns(mol, max_iter=200): ("ARG", "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]"), ("HIP", "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]"), - ("sulfone", "[S;X4;v4:1](-[*-:2])-[*-:3]" - ">>[S:1](=[*+0:2])=[*+0:3]"), + ("sulfone", "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]"), ("charged-N", "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), ]: reaction = AllChem.ReactionFromSmarts(rxn) @@ -754,7 +753,8 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): base_end_pattern = Chem.MolFromSmarts( "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0]=O)]") + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," + "$([S;D4;v4]-[O-])]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) @@ -778,9 +778,14 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # index of each atom anion1, a1, a2, anion2 = end_match term_atom = mol.GetAtomWithIdx(anion2) - # [*-]-*=*-C=O --> *=*-*=C-[O-] - if (term_atom.GetAtomicNum() == 6 and - term_atom.GetFormalCharge() == 0): + # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] + if ( + term_atom.GetAtomicNum() == 6 + and term_atom.GetFormalCharge() == 0 + ) or ( + term_atom.GetAtomicNum() == 7 + and term_atom.GetFormalCharge() == 1 + ): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) if (neighbor.GetAtomicNum() == 8 and @@ -788,6 +793,16 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) break + # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O + elif term_atom.GetAtomicNum() == 16: + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if (neighbor.GetAtomicNum() == 8 and + neighbor.GetFormalCharge() == -1 and + bond.GetBondTypeAsDouble() == 1): + bond.SetBondType(Chem.BondType.DOUBLE) + neighbor.SetFormalCharge(0) + break else: term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) # common part of the conjugated systems: [*-]-*=* @@ -819,8 +834,9 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): break else: # already performed all changes - anion, a1, a2 = matches[0] - backtrack = [] + match = matches[0] + anion, a1, a2 = match + backtrack = [set(match)] # charges a = mol.GetAtomWithIdx(anion) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7c5d1e95dc6..395ade91b07 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -502,7 +502,10 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", - "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1" + "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", + "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", + "[O-][n+]1onc2ccccc21", + ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From 555eab302fa35fb2921e59bd4f2514cfea91f091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 18 Nov 2020 16:09:52 +0100 Subject: [PATCH 14/47] add amino and nucleic acids test --- .../MDAnalysisTests/coordinates/test_rdkit.py | 123 ++++++++++++------ 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 395ade91b07..20cf7881ae3 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -363,6 +363,20 @@ def test_cache(self): @requires_rdkit class TestRDKitFunctions(object): + + def mol_to_template(self, mol): + """Remove bond orders and charges from molecule""" + template = Chem.AddHs(mol) + for atom in template.GetAtoms(): + atom.SetIsAromatic(False) + atom.SetFormalCharge(0) + atom.SetNoImplicit(True) + for bond in template.GetBonds(): + bond.SetIsAromatic(False) + bond.SetBondType(Chem.BondType.SINGLE) + template.UpdatePropertyCache(strict=False) + return template + @pytest.mark.parametrize("smi, out", [ ("C(-[H])(-[H])(-[H])-[H]", "C"), ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), @@ -473,52 +487,75 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): with pytest.raises(KeyError, match="_MDAnalysis_index"): atom.GetIntProp("_MDAnalysis_index") - @pytest.mark.parametrize("smi_in", [ - "c1ccc(cc1)-c1ccccc1-c1ccccc1", - "c1cc[nH]c1", - "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", - "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", - "c1csc(c1)-c1ccoc1-c1cc[nH]c1", - "C1=C2C(=NC=N1)N=CN2", - "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", - "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", - "C=CC=CC=CC=CC=CC=C", - "NCCCCC([NH3+])C(=O)[O-]", - "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", - "C#CC=C", + @pytest.mark.parametrize("input_type, input_str", [ + ("smi","c1ccc(cc1)-c1ccccc1-c1ccccc1"), + ("smi","c1cc[nH]c1"), + ("smi","c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1"), + ("smi","c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21"), + ("smi","c1csc(c1)-c1ccoc1-c1cc[nH]c1"), + ("smi","C1=C2C(=NC=N1)N=CN2"), + ("smi","CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]"), + ("smi","c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1"), + ("smi","C=CC=CC=CC=CC=CC=C"), + ("smi","NCCCCC([NH3+])C(=O)[O-]"), + ("smi","CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C"), + ("smi","C#CC=C"), # HID HIE HIP residues, see PR #2941 - "O=C([C@H](CC1=CNC=N1)N)O", - "O=C([C@H](CC1=CN=CN1)N)O", - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + ("smi","O=C([C@H](CC1=CNC=N1)N)O"), + ("smi","O=C([C@H](CC1=CN=CN1)N)O"), + ("smi","O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]"), # fixes from PR #??? - "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", - "[O-][n+]1cccnc1", - "C[n+]1ccccc1", - "[PH4+]", - "c1nc[nH]n1", - "CC(=O)C=C(C)N", - "CC(C)=CC=C[O-]", - "O=S(C)(C)=NC", - "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", - "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", - "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", - "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", - "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", - "[O-][n+]1onc2ccccc21", - + ("smi","CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1"), + ("smi","[O-][n+]1cccnc1"), + ("smi","C[n+]1ccccc1"), + ("smi","[PH4+]"), + ("smi","c1nc[nH]n1"), + ("smi","CC(=O)C=C(C)N"), + ("smi","CC(C)=CC=C[O-]"), + ("smi","O=S(C)(C)=NC"), + ("smi","Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1"), + ("smi","CCCC/C=C/C#CC#CCCCCCCCC(=O)O"), + ("smi","c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1"), + ("smi","N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1"), + ("smi","C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N"), + ("smi","[O-][n+]1onc2ccccc21"), + # test amino acids + ("aa", "A"), + ("aa", "G"), + ("aa", "I"), + ("aa", "L"), + ("aa", "P"), + ("aa", "V"), + ("aa", "F"), + ("aa", "W"), + ("aa", "Y"), + ("aa", "D"), + ("aa", "E"), + ("aa", "R"), + ("aa", "H"), + ("aa", "K"), + ("aa", "S"), + ("aa", "T"), + ("aa", "C"), + ("aa", "M"), + ("aa", "N"), + ("aa", "Q"), + # test nucleic acids + ("na", "A"), + ("na", "T"), + ("na", "U"), + ("na", "C"), + ("na", "G"), ]) - def test_order_independant(self, smi_in): + def test_order_independant(self, input_type, input_str): # generate mol with hydrogens but without bond orders - ref = Chem.MolFromSmiles(smi_in) - template = Chem.AddHs(ref) - for atom in template.GetAtoms(): - atom.SetIsAromatic(False) - atom.SetFormalCharge(0) - for bond in template.GetBonds(): - bond.SetIsAromatic(False) - bond.SetBondType(Chem.BondType.SINGLE) - template.UpdatePropertyCache(strict=False) - + if input_type == "smi": + ref = Chem.MolFromSmiles(input_str) + elif input_type == "na": + ref = Chem.MolFromSequence(input_str, flavor=9) + else: + ref = Chem.MolFromSequence(input_str) + template = self.mol_to_template(ref) # go through each possible starting atom for a in template.GetAtoms(): smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) @@ -542,7 +579,7 @@ def test_order_independant(self, smi_in): assert match, (f"(input) {Chem.MolToSmiles(ref)} != " f"{Chem.MolToSmiles(m)} (output) " f"root atom {a.GetIdx()}") - + def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) From 3e4d066affd0cec479664e47967e2cbb200949f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 18 Nov 2020 16:27:21 +0100 Subject: [PATCH 15/47] add test for ions --- .../MDAnalysisTests/coordinates/test_rdkit.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 20cf7881ae3..82c464af284 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -586,3 +586,19 @@ def test_warn_conjugated_max_iter(self): with pytest.warns(UserWarning, match="reasonable number of iterations"): _rebuild_conjugated_bonds(mol, 2) + + @pytest.mark.parametrize("smi", [ + "[Li+]", "[Na+]", "[K+]", "[Ag+]", + "[Mg+2]", "[Ca+2]", "[Cu+2]", "[Zn+2]", "[Fe+2]", + "[Al+3]", + "[Cl-]", + "[O-2]", + "[Na+].[Cl-]", + ]) + def test_ions(self, smi): + ref = Chem.MolFromSmiles(smi) + mol = self.mol_to_template(ref) + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol) + Chem.SanitizeMol(mol) + assert mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol) From ec54125c363f63dcd10d1e97c86c63c79494230b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 19 Nov 2020 15:46:19 +0100 Subject: [PATCH 16/47] final fixes --- package/MDAnalysis/coordinates/RDKit.py | 16 ++++++++++++++-- .../MDAnalysisTests/coordinates/test_rdkit.py | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 99c97cb1349..fca91f4acfd 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -749,7 +749,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) - pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0;!O]") + pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") base_end_pattern = Chem.MolFromSmarts( "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") odd_end_pattern = Chem.MolFromSmarts( @@ -771,6 +771,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # necessary end_pattern = base_end_pattern backtrack = [] + backtrack_cycles = 0 for _ in range(max_iter): # check for ending pattern end_match = mol.GetSubstructMatch(end_pattern) @@ -794,7 +795,8 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): neighbor.SetFormalCharge(-1) break # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O - elif term_atom.GetAtomicNum() == 16: + elif (term_atom.GetAtomicNum() == 16 and + term_atom.GetFormalCharge() == 0): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) if (neighbor.GetAtomicNum() == 8 and @@ -834,9 +836,19 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): break else: # already performed all changes + if backtrack_cycles == 1: + # might be stuck because there were 2 "odd" end patterns + # misqualified as a single base one + end_pattern = odd_end_pattern + elif backtrack_cycles > 1: + # likely stuck changing the same bonds over and over so + # let's stop here + mol.UpdatePropertyCache(strict=False) + return match = matches[0] anion, a1, a2 = match backtrack = [set(match)] + backtrack_cycles += 1 # charges a = mol.GetAtomWithIdx(anion) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 82c464af284..c800909c012 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -519,6 +519,8 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): ("smi","N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1"), ("smi","C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N"), ("smi","[O-][n+]1onc2ccccc21"), + ("smi", "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1"), + ("smi", "[O-]c1ccccc1"), # test amino acids ("aa", "A"), ("aa", "G"), From 3167e43db6f60816a83a451d8ad31c4baa56ecf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 19 Nov 2020 20:17:48 +0100 Subject: [PATCH 17/47] fix docs and unused import --- package/MDAnalysis/coordinates/RDKit.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index fca91f4acfd..9470f8f3e95 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -67,7 +67,6 @@ import warnings import re import copy -from functools import lru_cache import numpy as np @@ -626,13 +625,15 @@ def _standardize_patterns(mol, max_iter=200): +===============+==============================================================================+ | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | +---------------+------------------------------------------------------------------------------+ - | conjugated-N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | +---------------+------------------------------------------------------------------------------+ - | conjugated-O- | ``[O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | +---------------+------------------------------------------------------------------------------+ | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | +---------------+------------------------------------------------------------------------------+ - | Nterm | ``[N-;H1;$(N-[*^3]):1]>>[N+0:1]`` | + | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | +---------------+------------------------------------------------------------------------------+ | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | +---------------+------------------------------------------------------------------------------+ @@ -640,7 +641,7 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| +---------------+------------------------------------------------------------------------------+ - | sulfone | ``[S;X4;v4:1](-[*-:2])-[*-:3]>>[S:1](=[*+0:2])=[*+0:3]`` | + | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | +---------------+------------------------------------------------------------------------------+ | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | +---------------+------------------------------------------------------------------------------+ From 7b6823b93d64998e97c72f6a39d6305bb2ed836a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 19 Nov 2020 20:34:32 +0100 Subject: [PATCH 18/47] pep8 --- package/MDAnalysis/coordinates/RDKit.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 9470f8f3e95..de77186d499 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -273,8 +273,8 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): Parameters ----------- - obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` - + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or + :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the @@ -339,9 +339,8 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): return mol - def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200): - """Converts an AtomGroup to an RDKit molecule without coordinates. + """Converts an AtomGroup to an RDKit molecule Parameters ----------- @@ -645,7 +644,7 @@ def _standardize_patterns(mol, max_iter=200): +---------------+------------------------------------------------------------------------------+ | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | +---------------+------------------------------------------------------------------------------+ - + """ # standardize conjugated systems @@ -796,7 +795,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): neighbor.SetFormalCharge(-1) break # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O - elif (term_atom.GetAtomicNum() == 16 and + elif (term_atom.GetAtomicNum() == 16 and term_atom.GetFormalCharge() == 0): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) From 3f4645fb3f0ba3fea6722f1434952483f5b6a6ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 11 May 2021 22:10:25 +0200 Subject: [PATCH 19/47] pep8 --- package/MDAnalysis/converters/RDKit.py | 8 +-- .../MDAnalysisTests/converters/test_rdkit.py | 63 ++++++++++--------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index fac25a82d88..ffe9758530c 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -268,7 +268,7 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, Parameters ----------- - obj : :class:`~MDAnalysis.core.groups.AtomGroup` or + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be @@ -529,7 +529,7 @@ def _infer_bo_and_charges(mol): MONATOMIC_CATION_CHARGES = { 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, - 26: 2, # Fe could also be +3 + 26: 2, # Fe could also be +3 13: 3, } @@ -800,7 +800,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) if (neighbor.GetAtomicNum() == 8 and - bond.GetBondTypeAsDouble() == 2): + bond.GetBondTypeAsDouble() == 2): bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) break @@ -811,7 +811,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) if (neighbor.GetAtomicNum() == 8 and neighbor.GetFormalCharge() == -1 and - bond.GetBondTypeAsDouble() == 1): + bond.GetBondTypeAsDouble() == 1): bond.SetBondType(Chem.BondType.DOUBLE) neighbor.SetFormalCharge(0) break diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 7b583bbfeef..b8bbe9c2198 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -535,37 +535,37 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): atom.GetIntProp("_MDAnalysis_index") @pytest.mark.parametrize("input_type, input_str", [ - ("smi","c1ccc(cc1)-c1ccccc1-c1ccccc1"), - ("smi","c1cc[nH]c1"), - ("smi","c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1"), - ("smi","c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21"), - ("smi","c1csc(c1)-c1ccoc1-c1cc[nH]c1"), - ("smi","C1=C2C(=NC=N1)N=CN2"), - ("smi","CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]"), - ("smi","c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1"), - ("smi","C=CC=CC=CC=CC=CC=C"), - ("smi","NCCCCC([NH3+])C(=O)[O-]"), - ("smi","CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C"), - ("smi","C#CC=C"), + ("smi", "c1ccc(cc1)-c1ccccc1-c1ccccc1"), + ("smi", "c1cc[nH]c1"), + ("smi", "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1"), + ("smi", "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21"), + ("smi", "c1csc(c1)-c1ccoc1-c1cc[nH]c1"), + ("smi", "C1=C2C(=NC=N1)N=CN2"), + ("smi", "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]"), + ("smi", "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1"), + ("smi", "C=CC=CC=CC=CC=CC=C"), + ("smi", "NCCCCC([NH3+])C(=O)[O-]"), + ("smi", "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C"), + ("smi", "C#CC=C"), # HID HIE HIP residues, see PR #2941 - ("smi","O=C([C@H](CC1=CNC=N1)N)O"), - ("smi","O=C([C@H](CC1=CN=CN1)N)O"), - ("smi","O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]"), + ("smi", "O=C([C@H](CC1=CNC=N1)N)O"), + ("smi", "O=C([C@H](CC1=CN=CN1)N)O"), + ("smi", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]"), # fixes from PR #??? - ("smi","CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1"), - ("smi","[O-][n+]1cccnc1"), - ("smi","C[n+]1ccccc1"), - ("smi","[PH4+]"), - ("smi","c1nc[nH]n1"), - ("smi","CC(=O)C=C(C)N"), - ("smi","CC(C)=CC=C[O-]"), - ("smi","O=S(C)(C)=NC"), - ("smi","Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1"), - ("smi","CCCC/C=C/C#CC#CCCCCCCCC(=O)O"), - ("smi","c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1"), - ("smi","N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1"), - ("smi","C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N"), - ("smi","[O-][n+]1onc2ccccc21"), + ("smi", "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1"), + ("smi", "[O-][n+]1cccnc1"), + ("smi", "C[n+]1ccccc1"), + ("smi", "[PH4+]"), + ("smi", "c1nc[nH]n1"), + ("smi", "CC(=O)C=C(C)N"), + ("smi", "CC(C)=CC=C[O-]"), + ("smi", "O=S(C)(C)=NC"), + ("smi", "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1"), + ("smi", "CCCC/C=C/C#CC#CCCCCCCCC(=O)O"), + ("smi", "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1"), + ("smi", "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1"), + ("smi", "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N"), + ("smi", "[O-][n+]1onc2ccccc21"), ("smi", "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1"), ("smi", "[O-]c1ccccc1"), # test amino acids @@ -622,13 +622,14 @@ def test_order_independant(self, input_type, input_str): if not match: # try resonance structures for charged conjugated systems for mol in Chem.ResonanceMolSupplier(m, maxStructs=20): - match = mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol) + match = (mol.HasSubstructMatch(ref) and + ref.HasSubstructMatch(mol)) if match: break assert match, (f"(input) {Chem.MolToSmiles(ref)} != " f"{Chem.MolToSmiles(m)} (output) " f"root atom {a.GetIdx()}") - + def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) From 20a64ae222828448a6fd978d6425687f652d572d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 11 May 2021 22:49:21 +0200 Subject: [PATCH 20/47] comments --- package/MDAnalysis/converters/RDKit.py | 28 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index ffe9758530c..3436c4246e6 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -662,15 +662,15 @@ def _standardize_patterns(mol, max_iter=200): # list of sanitized reactions reactions = [] - for name, rxn in [ - ("Cterm", "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]"), - ("Nterm", "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]"), - ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), - ("ARG", "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]"), - ("HIP", "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" - ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]"), - ("sulfone", "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]"), - ("charged-N", "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]"), + for rxn in [ + "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm + "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm + "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate + "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG + "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" + ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP + "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone + "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N ]: reaction = AllChem.ReactionFromSmarts(rxn) reactions.append(reaction) @@ -789,7 +789,9 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # index of each atom anion1, a1, a2, anion2 = end_match term_atom = mol.GetAtomWithIdx(anion2) + # edge-case 1: C-[O-] or [N+]-[O-] # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] + # transform the =O to -[O-] if ( term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0 @@ -804,7 +806,9 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) break + # edge-case 2: S=O # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O + # transform -[O-] to =O elif (term_atom.GetAtomicNum() == 16 and term_atom.GetFormalCharge() == 0): for neighbor in term_atom.GetNeighbors(): @@ -815,9 +819,13 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): bond.SetBondType(Chem.BondType.DOUBLE) neighbor.SetFormalCharge(0) break + # not an edge case: + # increment the charge else: term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) - # common part of the conjugated systems: [*-]-*=* + # common to all cases: + # [*-]-*=*-[R] --> *=*-*=[R] + # increment the charge and switch single and double bonds a = mol.GetAtomWithIdx(anion1) a.SetFormalCharge(a.GetFormalCharge() + 1) b = mol.GetBondBetweenAtoms(anion1, a1) From 5c03ee8a3c89ab7e942fd296ef0c24d548d805ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 11 May 2021 22:54:46 +0200 Subject: [PATCH 21/47] fix rdkit pdb atom names formatting --- package/MDAnalysis/converters/RDKit.py | 10 +++++----- testsuite/MDAnalysisTests/converters/test_rdkit.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 3436c4246e6..39ba51a9e2d 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -474,14 +474,14 @@ def _add_mda_attr_to_rdkit(attr, value, mi): # PDB file (1 letter elements start at col 14) name = re.findall(r'(\D+|\d+)', value) if len(name) == 2: - symbol, number = name - if len(number) > 2 and len(symbol) == 1: - value = "{}{}".format(symbol, number) + if len(value) == 2: + value = f" {value} " else: - value = "{:>2.2}{:<2.2}".format(symbol, number) + symbol, number = name + value = f"{symbol + number:>4.4}" else: # no number in the name - value = " {:<}".format(name[0]) + value = f" {name[0]:<3.3}" # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index b8bbe9c2198..beb27918cb5 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -263,9 +263,14 @@ def test_warning_no_hydrogen_force(self, uo2): uo2.atoms.convert_to.rdkit(NoImplicit=False, force=True) @pytest.mark.parametrize("attr, value, expected", [ - ("names", "C1", " C1 "), - ("names", "C12", " C12"), - ("names", "Cl1", "Cl1 "), + ("names", "N", " N "), + ("names", "CA", " CA "), + ("names", "CAT", " CAT"), + ("names", "N1", " N1 "), + ("names", "CE2", " CE2"), + ("names", "C12", " C12"), + ("names", "HD12", "HD12"), + ("names", "C123", "C123"), ("altLocs", "A", "A"), ("chainIDs", "B", "B"), ("icodes", "C", "C"), From 44c523e5f1926386ce05cf957359fdab66145153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 11 May 2021 23:46:04 +0200 Subject: [PATCH 22/47] skip sanitization if fails --- package/MDAnalysis/converters/RDKit.py | 7 +++++-- testsuite/MDAnalysisTests/converters/test_rdkit.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 39ba51a9e2d..122ff1db446 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -433,8 +433,11 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): _infer_bo_and_charges(mol) mol = _standardize_patterns(mol, max_iter) - # sanitize - Chem.SanitizeMol(mol) + # sanitize if possible + err = Chem.SanitizeMol(mol, catchErrors=True) + if err: + warnings.warn("Could not sanitize molecule: " + f"failed during step {err!r}") return mol diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index beb27918cb5..f93600b3b20 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -410,6 +410,19 @@ def test_cache(self): new_cache = cached_func.cache_info() assert cache == new_cache + def test_sanitize_fail_warning(self): + mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", sanitize=False) + mol.UpdatePropertyCache(strict=False) + mol = Chem.AddHs(mol) + u = mda.Universe(mol) + with pytest.warns(UserWarning, match="Could not sanitize molecule"): + u.atoms.convert_to.rdkit(NoImplicit=False) + with pytest.warns(None) as record: + u.atoms.convert_to.rdkit() + if record: + assert all("Could not sanitize molecule" not in str(w.message) + for w in record.list) + @requires_rdkit class TestRDKitFunctions(object): From 0ce32710ef8f9b3b56e91f1d4df456dbfd429c8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 11 May 2021 23:55:13 +0200 Subject: [PATCH 23/47] reorder rdkit atoms to match mdanalysis --- package/MDAnalysis/converters/RDKit.py | 5 +++++ .../MDAnalysisTests/converters/test_rdkit.py | 19 +++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 122ff1db446..d8931b7af3e 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -439,6 +439,11 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): warnings.warn("Could not sanitize molecule: " f"failed during step {err!r}") + # reorder atoms to match MDAnalysis + order = np.argsort([atom.GetIntProp("_MDAnalysis_index") + for atom in mol.GetAtoms()]) + mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) + return mol diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index f93600b3b20..9e5ab09cb2d 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -204,13 +204,12 @@ def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) umol = sel.convert_to.rdkit(NoImplicit=False) atom = umol.GetAtomWithIdx(atom_index) - mda_index = atom.GetIntProp("_MDAnalysis_index") mi = atom.GetMonomerInfo() for mda_attr, rd_attr in RDATTRIBUTES.items(): rd_value = getattr(mi, "Get%s" % rd_attr)() if hasattr(sel, mda_attr): - mda_value = getattr(sel, mda_attr)[mda_index] + mda_value = getattr(sel, mda_attr)[atom_index] if mda_attr == "names": rd_value = rd_value.strip() assert rd_value == mda_value @@ -303,10 +302,9 @@ def test_other_attributes(self, mol2, idx): mol = mol2.atoms.convert_to("RDKIT") rdatom = mol.GetAtomWithIdx(idx) rdprops = rdatom.GetPropsAsDict() - mda_idx = int(rdprops["_MDAnalysis_index"]) for prop in ["charge", "segid", "type"]: rdprop = rdprops["_MDAnalysis_%s" % prop] - mdaprop = getattr(mol2.atoms[mda_idx], prop) + mdaprop = getattr(mol2.atoms[idx], prop) assert rdprop == mdaprop @pytest.mark.parametrize("sel_str", [ @@ -317,17 +315,13 @@ def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) mol = ag.convert_to.rdkit(NoImplicit=False) expected = [i for i in range(len(ag))] - indices = sorted([a.GetIntProp("_MDAnalysis_index") - for a in mol.GetAtoms()]) + indices = [a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()] assert_equal(indices, expected) def test_assign_coordinates(self, pdb): mol = pdb.atoms.convert_to.rdkit(NoImplicit=False) positions = mol.GetConformer().GetPositions() - indices = sorted(mol.GetAtoms(), - key=lambda a: a.GetIntProp("_MDAnalysis_index")) - indices = [a.GetIdx() for a in indices] - assert_almost_equal(positions[indices], pdb.atoms.positions) + assert_almost_equal(positions, pdb.atoms.positions) def test_assign_stereochemistry(self, mol2): umol = mol2.atoms.convert_to("RDKIT") @@ -342,10 +336,7 @@ def test_trajectory_coords(self): for ts in u.trajectory: mol = u.atoms.convert_to("RDKIT") positions = mol.GetConformer().GetPositions() - indices = sorted(mol.GetAtoms(), - key=lambda a: a.GetIntProp("_MDAnalysis_index")) - indices = [a.GetIdx() for a in indices] - assert_almost_equal(positions[indices], ts.positions) + assert_almost_equal(positions, ts.positions) def test_nan_coords(self): u = mda.Universe.from_smiles("CCO") From 7477e7ff7aad75a1539ff9a744b8c94a629cf462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 12 May 2021 02:24:57 +0200 Subject: [PATCH 24/47] change setting and transferring atom properties --- package/MDAnalysis/converters/RDKit.py | 111 ++++++++++++------ .../MDAnalysisTests/converters/test_rdkit.py | 71 +++++------ 2 files changed, 106 insertions(+), 76 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index d8931b7af3e..8d251a64374 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -72,7 +72,6 @@ import numpy as np from ..exceptions import NoDataError -from ..topology.guessers import guess_atom_element from ..core.topologyattrs import _TOPOLOGY_ATTRS from ..coordinates import memory from ..coordinates import base @@ -316,9 +315,9 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, # assign coordinates conf = Chem.Conformer(mol.GetNumAtoms()) for atom in mol.GetAtoms(): - idx = atom.GetIntProp("_MDAnalysis_index") + idx = atom.GetIdx() xyz = ag.positions[idx].astype(float) - conf.SetAtomPosition(atom.GetIdx(), xyz) + conf.SetAtomPosition(idx, xyz) mol.AddConformer(conf) # assign R/S to atoms and Z/E to bonds Chem.AssignStereochemistryFrom3D(mol) @@ -432,6 +431,10 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): # infer bond orders and formal charges from the connectivity _infer_bo_and_charges(mol) mol = _standardize_patterns(mol, max_iter) + # reorder atoms to match MDAnalysis + order = np.argsort([atom.GetIntProp("_MDAnalysis_index") + for atom in mol.GetAtoms()]) + mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) # sanitize if possible err = Chem.SanitizeMol(mol, catchErrors=True) @@ -439,11 +442,6 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): warnings.warn("Could not sanitize molecule: " f"failed during step {err!r}") - # reorder atoms to match MDAnalysis - order = np.argsort([atom.GetIntProp("_MDAnalysis_index") - for atom in mol.GetAtoms()]) - mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) - return mol @@ -496,14 +494,40 @@ def _add_mda_attr_to_rdkit(attr, value, mi): getattr(mi, "Set%s" % rdattr)(value) +def _set_str_prop(atom, attr, value): + atom.SetProp(attr, value) + +def _set_float_prop(atom, attr, value): + atom.SetDoubleProp(attr, value) + +def _set_np_float_prop(atom, attr, value): + atom.SetDoubleProp(attr, float(value)) + +def _set_int_prop(atom, attr, value): + atom.SetIntProp(attr, value) + +def _set_np_int_prop(atom, attr, value): + atom.SetIntProp(attr, int(value)) + +_atom_property_dispatcher = { + str: _set_str_prop, + float: _set_float_prop, + np.float32: _set_np_float_prop, + np.float64: _set_np_float_prop, + int: _set_int_prop, + np.int8: _set_np_int_prop, + np.int16: _set_np_int_prop, + np.int32: _set_np_int_prop, + np.int64: _set_np_int_prop, + np.uint8: _set_np_int_prop, + np.uint16: _set_np_int_prop, + np.uint32: _set_np_int_prop, + np.uint64: _set_np_int_prop, +} + def _set_atom_property(atom, attr, value): """Saves any attribute and value into an RDKit atom property""" - if isinstance(value, (float, np.floating)): - atom.SetDoubleProp(attr, float(value)) - elif isinstance(value, (int, np.integer)): - atom.SetIntProp(attr, int(value)) - else: - atom.SetProp(attr, value) + _atom_property_dispatcher[type(value)](atom, attr, value) def _infer_bo_and_charges(mol): @@ -690,11 +714,19 @@ def _standardize_patterns(mol, max_iter=200): fragments.append(reactant) # recombine fragments - mol = fragments.pop(0) + newmol = fragments.pop(0) for fragment in fragments: - mol = Chem.CombineMols(mol, fragment) + newmol = Chem.CombineMols(newmol, fragment) - return mol + # reassign all properties + _transfer_properties(mol, newmol) + + # fix bonds with "crossed" stereo + for bond in newmol.GetBonds(): + if bond.GetStereo() == Chem.BondStereo.STEREOANY: + bond.SetStereo(Chem.BondStereo.STEREONONE) + + return newmol def _run_reaction(reaction, reactant): @@ -724,7 +756,7 @@ def _run_reaction(reaction, reactant): if products: product = products[0][0] # map back atom properties from the reactant to the product - _reassign_props_after_reaction(reactant, product) + _reassign_index_after_reaction(reactant, product) # apply the next reaction to the product reactant = product else: @@ -903,23 +935,34 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): "reasonable number of iterations") -def _reassign_props_after_reaction(reactant, product): - """Maps back atomic properties from the reactant to the product. +def _reassign_index_after_reaction(reactant, product): + """Maps back MDAnalysis index from the reactant to the product. The product molecule is modified inplace. """ - for atom in product.GetAtoms(): - try: - atom.GetIntProp("old_mapno") - except KeyError: - pass - else: - atom.ClearProp("old_mapno") + prop = "_MDAnalysis_index" + if reactant.GetAtoms()[0].HasProp(prop): + for atom in product.GetAtoms(): idx = atom.GetUnsignedProp("react_atom_idx") old_atom = reactant.GetAtomWithIdx(idx) - for prop, value in old_atom.GetPropsAsDict().items(): - _set_atom_property(atom, prop, value) - # fix bonds with "crossed" stereo - for bond in atom.GetBonds(): - if bond.GetStereo() == Chem.BondStereo.STEREOANY: - bond.SetStereo(Chem.BondStereo.STEREONONE) - atom.ClearProp("react_atom_idx") + value = old_atom.GetIntProp(prop) + _set_atom_property(atom, prop, value) + + +def _transfer_properties(mol, newmol): + """Transfer properties between two RDKit molecules. Requires the + `_MDAnalysis_index` property to be present. Modifies `newmol` inplace. + """ + atoms = mol.GetAtoms() + if atoms[0].HasProp("_MDAnalysis_index"): + props = {} + for atom in atoms: + ix = atom.GetIntProp("_MDAnalysis_index") + p = atom.GetPropsAsDict() + # remove properties assigned during reactions + p.pop("old_mapno", None) + p.pop("react_atom_idx", None) + props[ix] = p + for atom in newmol.GetAtoms(): + ix = atom.GetIntProp("_MDAnalysis_index") + for attr, value in props[ix].items(): + _set_atom_property(atom, attr, value) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 9e5ab09cb2d..5a096d3ed09 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -43,7 +43,7 @@ _standardize_patterns, _rebuild_conjugated_bonds, _set_atom_property, - _reassign_props_after_reaction, + _reassign_index_after_reaction, ) except ImportError: pass @@ -82,26 +82,10 @@ def dummy_product(): mol.AddAtom(atom) return mol - def dummy_product_nomap(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - atom.SetUnsignedProp("react_atom_idx", 0) - mol.AddAtom(atom) - return mol - - def dummy_reactant_noprops(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - mol.AddAtom(atom) - return mol - def dummy_reactant(): mol = Chem.RWMol() atom = Chem.Atom(1) - atom.SetProp("foo", "bar") atom.SetIntProp("_MDAnalysis_index", 1) - atom.SetDoubleProp("_MDAnalysis_charge", 4.2) - atom.SetProp("_MDAnalysis_type", "C.3") mol.AddAtom(atom) return mol @@ -515,33 +499,36 @@ def test_set_atom_property(self, attr, value, getter): _set_atom_property(atom, prop, value) assert getattr(atom, getter)(prop) == value - @pytest.mark.parametrize("rdmol, product, name", [ - ("dummy_reactant", "dummy_product", "props"), - ("dummy_reactant_noprops", "dummy_product", "noprops"), - ("dummy_reactant", "dummy_product_nomap", "nomap"), + @pytest.mark.parametrize("rdmol, product", [ + ("dummy_reactant", "dummy_product"), ], indirect=["rdmol", "product"]) - def test_reassign_props_after_reaction(self, rdmol, product, name): - _reassign_props_after_reaction(rdmol, product) + def test_reassign_index_after_reaction(self, rdmol, product): + _reassign_index_after_reaction(rdmol, product) atom = product.GetAtomWithIdx(0) - if name == "props": - assert atom.GetProp("foo") == "bar" - assert atom.GetIntProp("_MDAnalysis_index") == 1 - assert atom.GetDoubleProp("_MDAnalysis_charge") == 4.2 - assert atom.GetProp("_MDAnalysis_type") == "C.3" - with pytest.raises(KeyError, match="old_mapno"): - atom.GetIntProp("old_mapno") - with pytest.raises(KeyError, match="react_atom_idx"): - atom.GetUnsignedProp("react_atom_idx") - elif name == "noprops": - with pytest.raises(KeyError, match="old_mapno"): - atom.GetIntProp("old_mapno") - with pytest.raises(KeyError, match="react_atom_idx"): - atom.GetUnsignedProp("react_atom_idx") - elif name == "nomap": - with pytest.raises(KeyError, match="react_atom_idx"): - atom.GetUnsignedProp("react_atom_idx") - with pytest.raises(KeyError, match="_MDAnalysis_index"): - atom.GetIntProp("_MDAnalysis_index") + assert atom.GetIntProp("_MDAnalysis_index") == 1 + + @pytest.mark.parametrize("smi", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + ]) + def test_transfer_properties(self, smi): + mol = Chem.MolFromSmiles(smi) + mol = self.mol_to_template(mol) + old = {} + for atom in mol.GetAtoms(): + ix = atom.GetIdx() + atom.SetIntProp("_MDAnalysis_index", ix) + atom.SetProp("dummy", f"foo_{ix}") + old[ix] = {"_MDAnalysis_index": ix, "dummy": f"foo_{ix}"} + _infer_bo_and_charges(mol) + newmol = _standardize_patterns(mol) + new = {} + for a in newmol.GetAtoms(): + ix = a.GetIntProp("_MDAnalysis_index") + new[ix] = {"_MDAnalysis_index": ix, + "dummy": a.GetProp("dummy")} + assert new == old @pytest.mark.parametrize("input_type, input_str", [ ("smi", "c1ccc(cc1)-c1ccccc1-c1ccccc1"), From df4feb4781cf999d74f687e96d14b41b8ae64f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 12 May 2021 12:27:51 +0200 Subject: [PATCH 25/47] fix tests + pep8 --- package/MDAnalysis/converters/RDKit.py | 13 +++++++++++-- testsuite/MDAnalysisTests/converters/test_rdkit.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 8d251a64374..70ff046fbcb 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -497,18 +497,27 @@ def _add_mda_attr_to_rdkit(attr, value, mi): def _set_str_prop(atom, attr, value): atom.SetProp(attr, value) + def _set_float_prop(atom, attr, value): atom.SetDoubleProp(attr, value) + def _set_np_float_prop(atom, attr, value): atom.SetDoubleProp(attr, float(value)) + def _set_int_prop(atom, attr, value): atom.SetIntProp(attr, value) + def _set_np_int_prop(atom, attr, value): atom.SetIntProp(attr, int(value)) + +def _ignore_prop(atom, attr, value): + pass + + _atom_property_dispatcher = { str: _set_str_prop, float: _set_float_prop, @@ -527,7 +536,7 @@ def _set_np_int_prop(atom, attr, value): def _set_atom_property(atom, attr, value): """Saves any attribute and value into an RDKit atom property""" - _atom_property_dispatcher[type(value)](atom, attr, value) + _atom_property_dispatcher.get(type(value), _ignore_prop)(atom, attr, value) def _infer_bo_and_charges(mol): @@ -863,7 +872,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # increment the charge else: term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) - # common to all cases: + # common to all cases: # [*-]-*=*-[R] --> *=*-*=[R] # increment the charge and switch single and double bonds a = mol.GetAtomWithIdx(anion1) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 5a096d3ed09..c61fb48cec0 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -547,7 +547,7 @@ def test_transfer_properties(self, smi): ("smi", "O=C([C@H](CC1=CNC=N1)N)O"), ("smi", "O=C([C@H](CC1=CN=CN1)N)O"), ("smi", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]"), - # fixes from PR #??? + # fixes from PR #3044 ("smi", "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1"), ("smi", "[O-][n+]1cccnc1"), ("smi", "C[n+]1ccccc1"), From 359fdf4526904bbb915c76e808ed90050419cb01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 12 May 2021 13:26:30 +0200 Subject: [PATCH 26/47] fix tests --- package/MDAnalysis/converters/RDKit.py | 9 ++++----- .../MDAnalysisTests/converters/test_rdkit.py | 20 ++++++++++++++++--- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 70ff046fbcb..339e413d3bf 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -966,12 +966,11 @@ def _transfer_properties(mol, newmol): props = {} for atom in atoms: ix = atom.GetIntProp("_MDAnalysis_index") - p = atom.GetPropsAsDict() - # remove properties assigned during reactions - p.pop("old_mapno", None) - p.pop("react_atom_idx", None) - props[ix] = p + props[ix] = atom.GetPropsAsDict() for atom in newmol.GetAtoms(): ix = atom.GetIntProp("_MDAnalysis_index") for attr, value in props[ix].items(): _set_atom_property(atom, attr, value) + # remove properties assigned during reactions + atom.ClearProp("old_mapno") + atom.ClearProp("react_atom_idx") diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index c61fb48cec0..0281de4cf58 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -486,8 +486,14 @@ def test_standardize_patterns(self, smi, out): @pytest.mark.parametrize("attr, value, getter", [ ("index", 42, "GetIntProp"), + ("index", np.int8(42), "GetIntProp"), + ("index", np.int16(42), "GetIntProp"), ("index", np.int32(42), "GetIntProp"), ("index", np.int64(42), "GetIntProp"), + ("index", np.uint8(42), "GetIntProp"), + ("index", np.uint16(42), "GetIntProp"), + ("index", np.uint32(42), "GetIntProp"), + ("index", np.uint64(42), "GetIntProp"), ("charge", 4.2, "GetDoubleProp"), ("charge", np.float32(4.2), "GetDoubleProp"), ("charge", np.float64(4.2), "GetDoubleProp"), @@ -499,6 +505,11 @@ def test_set_atom_property(self, attr, value, getter): _set_atom_property(atom, prop, value) assert getattr(atom, getter)(prop) == value + def test_ignore_prop(self): + atom = Chem.Atom(1) + _set_atom_property(atom, "foo", {"bar": "baz"}) + assert "foo" not in atom.GetPropsAsDict().items() + @pytest.mark.parametrize("rdmol, product", [ ("dummy_reactant", "dummy_product"), ], indirect=["rdmol", "product"]) @@ -528,6 +539,9 @@ def test_transfer_properties(self, smi): ix = a.GetIntProp("_MDAnalysis_index") new[ix] = {"_MDAnalysis_index": ix, "dummy": a.GetProp("dummy")} + props = a.GetPropsAsDict().keys() + assert "old_mapno" not in props + assert "react_atom_idx" not in props assert new == old @pytest.mark.parametrize("input_type, input_str", [ @@ -634,9 +648,9 @@ def test_warn_conjugated_max_iter(self): _rebuild_conjugated_bonds(mol, 2) @pytest.mark.parametrize("smi", [ - "[Li+]", "[Na+]", "[K+]", "[Ag+]", - "[Mg+2]", "[Ca+2]", "[Cu+2]", "[Zn+2]", "[Fe+2]", - "[Al+3]", + "[Li+]", "[Na+]", "[K+]", "[Rb+]", "[Ag+]", "[Cs+]", + "[Mg+2]", "[Ca+2]", "[Cu+2]", "[Zn+2]", "[Sr+2]", "[Ba+2]", + "[Al+3]", "[Fe+2]", "[Cl-]", "[O-2]", "[Na+].[Cl-]", From e3585043a7e09e950c12d51f61474617bd16c475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 May 2021 22:41:40 +0200 Subject: [PATCH 27/47] add more comments and UGM presentation --- package/MDAnalysis/converters/RDKit.py | 55 ++++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 339e413d3bf..d3cfc051c68 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -252,6 +252,11 @@ class RDKitConverter(base.ConverterBase): cache since the arguments given are different. You can pass a ``cache=False`` argument to the converter to bypass the caching system. + To get a better understanding of how the converter works under the hood, + please refer to the following RDKit UGM presentation: + - `Video (4:55 to 8:05) `__ + - Slides `__ + .. versionadded:: 2.0.0 @@ -478,15 +483,18 @@ def _add_mda_attr_to_rdkit(attr, value, mi): if attr == "names": # RDKit needs the name to be properly formatted for a # PDB file (1 letter elements start at col 14) - name = re.findall(r'(\D+|\d+)', value) + name = re.findall(r'(\D+|\d+)', value) # splits alphabet from numeric + # alpha-numeric name if len(name) == 2: + # only two characters if len(value) == 2: value = f" {value} " + # more than two --> cut to 4 characters else: symbol, number = name value = f"{symbol + number:>4.4}" + # no number in the name else: - # no number in the name value = f" {name[0]:<3.3}" # set attribute value in RDKit MonomerInfo @@ -563,10 +571,15 @@ def _infer_bo_and_charges(mol): R-C(-O)-O the first oxygen read will receive a double bond and the other one will be charged. It will also affect more complex conjugated systems. """ + # sort atoms according to their NUE atoms = sorted([a for a in mol.GetAtoms() if a.GetAtomicNum() > 1], reverse=True, key=lambda a: _get_nb_unpaired_electrons(a)[0]) + # charges that should be assigned to monatomic cations + # structure --> atomic number : formal charge + # anion charges are directly handled by the code using the typical valence + # of the atom MONATOMIC_CATION_CHARGES = { 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, @@ -757,19 +770,27 @@ def _run_reaction(reaction, reactant): product : rdkit.Chem.rdchem.Mol The final product of the reaction """ + # repeat the reaction until all matching moeities have been transformed + # note: this loop is meant to be ended by a `break` statement + # but let's avoid using `while` loops just in case for n in range(reactant.GetNumAtoms()): reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) products = reaction.RunReactants((reactant,)) - # only keep the first product if products: + # structure: tuple[tuple[mol]] + # length of 1st tuple: number of matches in reactant + # length of 2nd tuple: number of products yielded by the reaction + # if there's no match in reactant, returns an empty tuple + # here we have at least one match, and the reaction used yield + # a single product hence `products[0][0]` product = products[0][0] # map back atom properties from the reactant to the product _reassign_index_after_reaction(reactant, product) # apply the next reaction to the product reactant = product + # exit the loop if there was nothing to transform else: - # exit the loop if there's no product break reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) @@ -805,26 +826,36 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): The molecule to transform, modified inplace max_iter : int Maximum number of iterations performed by the function + + Notes + ----- + The molecule is modified inplace """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) + # pattern used to find problematic conjugated bonds + # there's usually an even number of matches for this pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") + # pattern used to finish fixing a series of conjugated bonds base_end_pattern = Chem.MolFromSmarts( "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + # used when there's an odd number of matches for `pattern` odd_end_pattern = Chem.MolFromSmarts( "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," "$([S;D4;v4]-[O-])]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) + # nothing to standardize if n_matches == 0: - # nothing to standardize return + # single match (unusual) elif n_matches == 1: # as a last resort, the only way to standardize is to find a nitrogen # that can accept a double bond and a positive charge # or a carbonyl that will become an enolate end_pattern = odd_end_pattern + # at least 2 matches else: # give priority to base end pattern and then deal with the odd one if # necessary @@ -886,23 +917,23 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): mol.UpdatePropertyCache(strict=False) continue - # shorten the anion-anion pattern from n to n-1 + # switch the position of the charge and the double bond matches = mol.GetSubstructMatches(pattern) if matches: # check if we haven't already transformed this triplet for match in matches: - # sort the indices for the comparison + # store order-independent atom indices g = set(match) + # already transformed --> try the next one if g in backtrack: - # already transformed continue + # add to backtracking and start the switch else: - # take the first one that hasn't been tried anion, a1, a2 = match backtrack.append(g) break + # already performed all changes else: - # already performed all changes if backtrack_cycles == 1: # might be stuck because there were 2 "odd" end patterns # misqualified as a single base one @@ -917,12 +948,12 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): backtrack = [set(match)] backtrack_cycles += 1 - # charges + # switch charges a = mol.GetAtomWithIdx(anion) a.SetFormalCharge(a.GetFormalCharge() + 1) a = mol.GetAtomWithIdx(a2) a.SetFormalCharge(a.GetFormalCharge() - 1) - # bonds + # switch bond orders b = mol.GetBondBetweenAtoms(anion, a1) b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) b = mol.GetBondBetweenAtoms(a1, a2) From 8fbc8520df7b398195a0620a90f14d77788bfe65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 May 2021 23:04:15 +0200 Subject: [PATCH 28/47] fix links --- package/MDAnalysis/converters/RDKit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index d3cfc051c68..bc8e822e7b1 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -254,8 +254,9 @@ class RDKitConverter(base.ConverterBase): To get a better understanding of how the converter works under the hood, please refer to the following RDKit UGM presentation: - - `Video (4:55 to 8:05) `__ - - Slides `__ + + * `Video (4:55 to 8:05) `__ + * `Slides `__ .. versionadded:: 2.0.0 From 2c3e2fab60210a205c9440e817eeb6bccb8890dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 24 May 2021 18:52:37 +0200 Subject: [PATCH 29/47] add test for new atom reordering --- .../MDAnalysisTests/converters/test_rdkit.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 0281de4cf58..667e8e306cf 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -22,6 +22,7 @@ # import copy +from io import StringIO import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element @@ -662,3 +663,20 @@ def test_ions(self, smi): mol = _standardize_patterns(mol) Chem.SanitizeMol(mol) assert mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol) + + @pytest.mark.parametrize("smi", [ + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + "O=S(C)(C)=NC", + ]) + def test_reorder_atoms(self, smi): + mol = Chem.MolFromSmiles(smi) + mol = Chem.AddHs(mol) + # remove bond order and charges info + pdb = Chem.MolToPDBBlock(mol) + u = mda.Universe(StringIO(pdb), format="PDB") + # atoms are reordered during infering, and will be reordered back to + # their original order with Chem.RenumberAtoms + mu = u.atoms.convert_to.rdkit() + values = [a.GetSymbol() for a in mu.GetAtoms()] + expected = [a.GetSymbol() for a in mol.GetAtoms()] + assert values == expected From 9ba698d9666a7512c22537134b486917079345ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 24 May 2021 18:54:54 +0200 Subject: [PATCH 30/47] remove unnecessary sorting when using rdkit-based guessers or selection --- package/MDAnalysis/core/selection.py | 8 +++----- package/MDAnalysis/topology/guessers.py | 9 +++------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py index d72ec4f6c03..f28c9c0d1fd 100644 --- a/package/MDAnalysis/core/selection.py +++ b/package/MDAnalysis/core/selection.py @@ -660,12 +660,10 @@ def apply(self, group): raise ValueError(f"{self.pattern!r} is not a valid SMARTS query") mol = group.convert_to("RDKIT") matches = mol.GetSubstructMatches(pattern, useChirality=True) - # convert rdkit indices to mdanalysis' - indices = [ - mol.GetAtomWithIdx(idx).GetIntProp("_MDAnalysis_index") - for match in matches for idx in match] + # flatten all matches and remove duplicated indices + indices = np.unique([idx for match in matches for idx in match]) # create boolean mask for atoms based on index - mask = np.in1d(range(group.n_atoms), np.unique(indices)) + mask = np.in1d(range(group.n_atoms), indices) return group[mask] diff --git a/package/MDAnalysis/topology/guessers.py b/package/MDAnalysis/topology/guessers.py index b85842bc439..948b91fa7bf 100644 --- a/package/MDAnalysis/topology/guessers.py +++ b/package/MDAnalysis/topology/guessers.py @@ -494,9 +494,7 @@ def guess_aromaticities(atomgroup): .. versionadded:: 2.0.0 """ mol = atomgroup.convert_to("RDKIT") - atoms = sorted(mol.GetAtoms(), - key=lambda a: a.GetIntProp("_MDAnalysis_index")) - return np.array([atom.GetIsAromatic() for atom in atoms]) + return np.array([atom.GetIsAromatic() for atom in mol.GetAtoms()]) def guess_gasteiger_charges(atomgroup): @@ -518,7 +516,6 @@ def guess_gasteiger_charges(atomgroup): mol = atomgroup.convert_to("RDKIT") from rdkit.Chem.rdPartialCharges import ComputeGasteigerCharges ComputeGasteigerCharges(mol, throwOnParamFailure=True) - atoms = sorted(mol.GetAtoms(), - key=lambda a: a.GetIntProp("_MDAnalysis_index")) - return np.array([atom.GetDoubleProp("_GasteigerCharge") for atom in atoms], + return np.array([atom.GetDoubleProp("_GasteigerCharge") + for atom in mol.GetAtoms()], dtype=np.float32) From 3f6ff64dc0c512e793783ad32b56bfb3526b2471 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Feb 2022 20:21:05 +0100 Subject: [PATCH 31/47] add missing test --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index d260551e1d4..ca5c10eb760 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -571,6 +571,7 @@ def test_transfer_properties(self, smi): ("smi", "[O-][n+]1onc2ccccc21"), ("smi", "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1"), ("smi", "[O-]c1ccccc1"), + ("smi", "[O-]C=CC=CCC=CC=[N+](C)C"), # test amino acids ("aa", "A"), ("aa", "G"), From 58725594aed4540079abb617bf5018c9fe37e66f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 17 Feb 2022 18:09:10 +0100 Subject: [PATCH 32/47] add bond stereo test --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index ca5c10eb760..372a5bc62e5 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -673,3 +673,13 @@ def test_reorder_atoms(self, smi): values = [a.GetSymbol() for a in mu.GetAtoms()] expected = [a.GetSymbol() for a in mol.GetAtoms()] assert values == expected + + def test_convert_stereoany_to_stereonone(self): + mol = Chem.MolFromSmiles("FC(Cl)=C(Br)I") + bond = next(b for b in mol.GetBonds() + if b.GetBondTypeAsDouble() == 2) + bond.SetStereo(Chem.BondStereo.STEREOANY) + newmol = _standardize_patterns(mol) + bond = next(b for b in newmol.GetBonds() + if b.GetBondTypeAsDouble() == 2) + assert bond.GetStereo() == Chem.BondStereo.STEREONONE From 428d341f48e84e9a0b3b22dcd6ae745021a3d8b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 7 Mar 2022 20:08:51 +0100 Subject: [PATCH 33/47] more docs and comments --- package/MDAnalysis/converters/RDKit.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 3e6a8ff7f51..3cc6f684cfa 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -53,7 +53,7 @@ infers the structures of approximately 90% of the `ChEMBL27`_ dataset. Work is currently ongoing on further improving this and updates to the converter are expected in future releases of MDAnalysis. - Please see `Pull Request #3044`_ for more details. + Please see `Pull Request #3044`_ and `Issue #3339`_ for more details. @@ -77,6 +77,7 @@ .. _`ChEMBL27`: https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/ .. _`Pull Request #3044`: https://github.com/MDAnalysis/mdanalysis/pull/3044 +.. _`Issue #3339`: https://github.com/MDAnalysis/mdanalysis/issues/3339 """ @@ -261,6 +262,10 @@ class RDKitConverter(base.ConverterBase): cache since the arguments given are different. You can pass a ``cache=False`` argument to the converter to bypass the caching system. + The ``_MDAnalysis_index`` property of the resulting molecule corresponds + to the index of the specific :class:`~MDAnalysis.core.groups.AtomGroup` + that was converted, which may not always match the ``index`` property. + To get a better understanding of how the converter works under the hood, please refer to the following RDKit UGM presentation: @@ -270,6 +275,11 @@ class RDKitConverter(base.ConverterBase): .. versionadded:: 2.0.0 + .. versionchanged:: 2.2.0 + Improved the accuracy of the converter. Atoms in the resulting molecule + follow the same order as in the AtomGroup. Fixed a + ``SanitizationError`` when disabling the bond order inferring step. + """ lib = 'RDKIT' @@ -440,7 +450,8 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): # infer bond orders and formal charges from the connectivity _infer_bo_and_charges(mol) mol = _standardize_patterns(mol, max_iter) - # reorder atoms to match MDAnalysis + # reorder atoms to match MDAnalysis, since the reactions from + # _standardize_patterns will mess up the original order order = np.argsort([atom.GetIntProp("_MDAnalysis_index") for atom in mol.GetAtoms()]) mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) From 50de418b8df968660e5ec0cf507f25efdc32edc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 7 Mar 2022 20:10:39 +0100 Subject: [PATCH 34/47] use PDBWriter.deduce_pdb_atom_name instead of custom code --- package/MDAnalysis/converters/RDKit.py | 36 +++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 3cc6f684cfa..1b2b366cdbd 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -82,9 +82,9 @@ """ import warnings -import re import copy from functools import lru_cache +from io import StringIO import numpy as np @@ -92,6 +92,7 @@ from ..core.topologyattrs import _TOPOLOGY_ATTRS from ..coordinates import memory from ..coordinates import base +from ..coordinates.PDB import PDBWriter try: from rdkit import Chem @@ -122,6 +123,9 @@ PERIODIC_TABLE = Chem.GetPeriodicTable() +_deduce_PDB_atom_name = PDBWriter(StringIO())._deduce_PDB_atom_name + + class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. @@ -398,6 +402,13 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): for attr in RDATTRIBUTES.keys(): if hasattr(ag, attr): pdb_attrs[attr] = getattr(ag, attr) + resnames = pdb_attrs.get("resnames", None) + if resnames is None: + def get_resname(idx): + return "" + else: + def get_resname(idx): + return resnames[idx] other_attrs = {} for attr in ["charges", "segids", "types"]: @@ -416,7 +427,7 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): # add PDB-like properties mi = Chem.AtomPDBResidueInfo() for attr, values in pdb_attrs.items(): - _add_mda_attr_to_rdkit(attr, values[i], mi) + _add_mda_attr_to_rdkit(attr, values[i], mi, get_resname(i)) rdatom.SetMonomerInfo(mi) # other properties for attr in other_attrs.keys(): @@ -479,7 +490,7 @@ def set_converter_cache_size(maxsize): atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__) -def _add_mda_attr_to_rdkit(attr, value, mi): +def _add_mda_attr_to_rdkit(attr, value, mi, resname=""): """Converts an MDAnalysis atom attribute into the RDKit equivalent and stores it into an RDKit :class:`~rdkit.Chem.rdchem.AtomPDBResidueInfo`. @@ -491,26 +502,15 @@ def _add_mda_attr_to_rdkit(attr, value, mi): Attribute value as found in the AtomGroup mi : rdkit.Chem.rdchem.AtomPDBResidueInfo MonomerInfo object that will store the relevant atom attributes + resname : str + Residue name of the atom, if available """ if isinstance(value, np.generic): # convert numpy types to python standard types value = value.item() if attr == "names": - # RDKit needs the name to be properly formatted for a - # PDB file (1 letter elements start at col 14) - name = re.findall(r'(\D+|\d+)', value) # splits alphabet from numeric - # alpha-numeric name - if len(name) == 2: - # only two characters - if len(value) == 2: - value = f" {value} " - # more than two --> cut to 4 characters - else: - symbol, number = name - value = f"{symbol + number:>4.4}" - # no number in the name - else: - value = f" {name[0]:<3.3}" + # RDKit needs the name to be properly formatted for a PDB file + value = _deduce_PDB_atom_name(value, resname) # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] From eec6b1d56cf26f16e0bf7896e267e4add0320a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 7 Mar 2022 20:11:35 +0100 Subject: [PATCH 35/47] remove unnecessary bond stereo assignment --- package/MDAnalysis/converters/RDKit.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 1b2b366cdbd..bb063962f1e 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -758,11 +758,6 @@ def _standardize_patterns(mol, max_iter=200): # reassign all properties _transfer_properties(mol, newmol) - # fix bonds with "crossed" stereo - for bond in newmol.GetBonds(): - if bond.GetStereo() == Chem.BondStereo.STEREOANY: - bond.SetStereo(Chem.BondStereo.STEREONONE) - return newmol From 393d1419e48e4b7ef6150aae833b3060c2dc376c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 11 Mar 2022 13:17:51 +0100 Subject: [PATCH 36/47] remove unused bond test --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 372a5bc62e5..ca5c10eb760 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -673,13 +673,3 @@ def test_reorder_atoms(self, smi): values = [a.GetSymbol() for a in mu.GetAtoms()] expected = [a.GetSymbol() for a in mol.GetAtoms()] assert values == expected - - def test_convert_stereoany_to_stereonone(self): - mol = Chem.MolFromSmiles("FC(Cl)=C(Br)I") - bond = next(b for b in mol.GetBonds() - if b.GetBondTypeAsDouble() == 2) - bond.SetStereo(Chem.BondStereo.STEREOANY) - newmol = _standardize_patterns(mol) - bond = next(b for b in newmol.GetBonds() - if b.GetBondTypeAsDouble() == 2) - assert bond.GetStereo() == Chem.BondStereo.STEREONONE From a2886e5e5ed8fe8bd845bcb158e939b0f6ac3af8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 17:57:06 +0100 Subject: [PATCH 37/47] add _MDAnalysis_name atom property --- package/MDAnalysis/converters/RDKit.py | 13 +++++++++---- testsuite/MDAnalysisTests/converters/test_rdkit.py | 8 ++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index bb063962f1e..fdaa8d34e91 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -195,6 +195,7 @@ class RDKitConverter(base.ConverterBase): | icodes | atom.GetMonomerInfo().GetInsertionCode() | +-----------------------+-------------------------------------------+ | names | atom.GetMonomerInfo().GetName() | + | | atom.GetProp("_MDAnalysis_name") | +-----------------------+-------------------------------------------+ | occupancies | atom.GetMonomerInfo().GetOccupancy() | +-----------------------+-------------------------------------------+ @@ -280,9 +281,13 @@ class RDKitConverter(base.ConverterBase): .. versionadded:: 2.0.0 .. versionchanged:: 2.2.0 - Improved the accuracy of the converter. Atoms in the resulting molecule - follow the same order as in the AtomGroup. Fixed a - ``SanitizationError`` when disabling the bond order inferring step. + See the full changelog for more details: + + - Improved the accuracy of the converter + - Atoms in the resulting molecule follow the same order as in the + AtomGroup + - Fixed a ``SanitizationError`` when disabling the bond order inferring + step """ @@ -411,7 +416,7 @@ def get_resname(idx): return resnames[idx] other_attrs = {} - for attr in ["charges", "segids", "types"]: + for attr in ["charges", "segids", "types", "names"]: if hasattr(ag, attr): other_attrs[attr] = getattr(ag, attr) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index ca5c10eb760..883fd1158b3 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -673,3 +673,11 @@ def test_reorder_atoms(self, smi): values = [a.GetSymbol() for a in mu.GetAtoms()] expected = [a.GetSymbol() for a in mol.GetAtoms()] assert values == expected + + def test_pdb_names(self): + u = mda.Universe(PDB_helix) + mol = u.atoms.convert_to.rdkit() + names = u.atoms.names + rd_names = np.array([a.GetProp("_MDAnalysis_name") + for a in mol.GetAtoms()]) + assert (names == rd_names).all() From 055692bf0941ad9d414eeec49e729ae849289ad0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 19:05:50 +0100 Subject: [PATCH 38/47] add bond stereo test --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 883fd1158b3..7434ab5e2f6 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -681,3 +681,15 @@ def test_pdb_names(self): rd_names = np.array([a.GetProp("_MDAnalysis_name") for a in mol.GetAtoms()]) assert (names == rd_names).all() + + @pytest.mark.parametrize("smi", [ + r"F/C(Br)=C(Cl)/I", + r"F\C(Br)=C(Cl)\I", + "F-C(Br)=C(Cl)-I", + ]) + def test_bond_stereo_not_stereoany(self, smi): + u = mda.Universe.from_smiles(smi) + mol = u.atoms.convert_to.rdkit(force=True) + for bond in mol.GetBonds(): + if bond.GetBondTypeAsDouble() == 2: + assert bond.GetStereo() != Chem.BondStereo.STEREOANY From 3c9d874fd06eb05a656083c2ceb76714d912e62b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 19:06:05 +0100 Subject: [PATCH 39/47] mention failing molecules in docs --- package/MDAnalysis/converters/RDKit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index fdaa8d34e91..5bf73a8bd8e 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -277,6 +277,10 @@ class RDKitConverter(base.ConverterBase): * `Video (4:55 to 8:05) `__ * `Slides `__ + Their are some molecules containing specific patterns that the converter + cannot currently tackle correctly. See + `Issue #3339 `__ for + more info. .. versionadded:: 2.0.0 From 1a094faa8af0cbb4638aeac57193fc0ef6a21be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 19:27:59 +0100 Subject: [PATCH 40/47] update changelog --- package/CHANGELOG | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 3e68e15fecc..9f6412b85e0 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -13,13 +13,23 @@ The rules for this file: * release numbers follow "Semantic Versioning" http://semver.org ------------------------------------------------------------------------------ -??/??/?? IAlibay +??/??/?? IAlibay, cbouy * 2.2.0 Fixes + * When converting an AtomGroup to an RDKit molecule (PR #3044): + - the atoms are now in the same order + - the output of `atom.GetMonomerInfo().GetName()` now follows the + guidelines for PDB files while the original name is still available + through `atom.GetProp("_MDAnalysis_name")` + - using `NoImplicit=False` should no longer throw a `SanitizationError` Enhancements + * Improves the RDKitConverter's accuracy (PR #3044): AtomGroups containing + monatomic ion charges or edge cases with nitrogen, sulfur, phosphorus and + conjugated systems should now have correctly assigned bond orders and + charges. Changes From 9b3b7b5972e0d8b90c7f6e62b710cd531bf61e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 21:15:04 +0100 Subject: [PATCH 41/47] improve readability --- .../MDAnalysisTests/converters/test_rdkit.py | 234 +++++++++--------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 7434ab5e2f6..a49f5767f08 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -101,6 +101,11 @@ def product(request): return getattr(MolFactory, request.param)() +def is_isomorphic(mol, ref, useChirality=False): + return (mol.HasSubstructMatch(ref, useChirality=useChirality) + and ref.HasSubstructMatch(mol, useChirality=useChirality)) + + @requires_rdkit class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ @@ -204,7 +209,7 @@ def test_monomer_info(self, pdb, sel_str, atom_index): def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") - assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) + assert is_isomorphic(rdmol, umol) u2 = mda.Universe(umol) assert_equal(u.atoms.bonds, u2.atoms.bonds) assert_equal(u.atoms.elements, u2.atoms.elements) @@ -303,9 +308,7 @@ def test_assign_coordinates(self, pdb): def test_assign_stereochemistry(self, mol2): umol = mol2.atoms.convert_to("RDKIT") rdmol = Chem.MolFromMol2File(mol2_molecule, removeHs=False) - assert rdmol.HasSubstructMatch( - umol, useChirality=True) and umol.HasSubstructMatch( - rdmol, useChirality=True) + assert is_isomorphic(rdmol, umol, useChirality=True) def test_trajectory_coords(self): u = mda.Universe.from_smiles( @@ -395,18 +398,46 @@ def test_sanitize_fail_warning(self): @requires_rdkit class TestRDKitFunctions(object): - def mol_to_template(self, mol): - """Remove bond orders and charges from molecule""" - template = Chem.AddHs(mol) - for atom in template.GetAtoms(): + def add_Hs_remove_bo_and_charges(self, mol): + """Add hydrogens and remove bond orders and charges from a molecule""" + mH = Chem.AddHs(mol) + for atom in mH.GetAtoms(): atom.SetIsAromatic(False) atom.SetFormalCharge(0) atom.SetNoImplicit(True) - for bond in template.GetBonds(): + for bond in mH.GetBonds(): bond.SetIsAromatic(False) bond.SetBondType(Chem.BondType.SINGLE) - template.UpdatePropertyCache(strict=False) - return template + mH.UpdatePropertyCache(strict=False) + return mH + + def enumerate_reordered_mol(self, mol): + """Enumerates all possible starting atoms for a given molecule""" + # go through each possible starting atom + for root_atom in mol.GetAtoms(): + smi = Chem.MolToSmiles(mol, rootedAtAtom=root_atom.GetIdx()) + reordered_mol = Chem.MolFromSmiles(smi, sanitize=False) + for atom in reordered_mol.GetAtoms(): + atom.SetNoImplicit(True) + reordered_mol.UpdatePropertyCache(strict=False) + yield reordered_mol + + def assign_bond_orders_and_charges(self, mol): + """Returns a sanitized molecule with infered bond orders and charges""" + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol) + Chem.SanitizeMol(mol) + return mol + + def assert_mols_equal_or_resonance_structure(self, mol, ref): + """Checks if 2 molecules are equal. If not, enumerates the resonance + structures of the first molecule and checks again""" + isomorphic = is_isomorphic(mol, ref) + if not is_isomorphic: + # try resonance structures + isomorphic = any(is_isomorphic(res_mol, ref) + for res_mol in Chem.ResonanceMolSupplier(mol)) + assert isomorphic @pytest.mark.parametrize("smi, out", [ ("C(-[H])(-[H])(-[H])-[H]", "C"), @@ -431,8 +462,7 @@ def test_infer_bond_orders(self, smi, out): Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( - mol), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) @pytest.mark.parametrize("smi, atom_idx, charge", [ ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), @@ -469,13 +499,10 @@ def test_infer_charges(self, smi, atom_idx, charge): def test_standardize_patterns(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol) - Chem.SanitizeMol(mol) + mol = self.assign_bond_orders_and_charges(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( - mol), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) @pytest.mark.parametrize("attr, value, getter", [ ("index", 42, "GetIntProp"), @@ -518,15 +545,14 @@ def test_reassign_index_after_reaction(self, rdmol, product): ]) def test_transfer_properties(self, smi): mol = Chem.MolFromSmiles(smi) - mol = self.mol_to_template(mol) + mol = self.add_Hs_remove_bo_and_charges(mol) old = {} for atom in mol.GetAtoms(): ix = atom.GetIdx() atom.SetIntProp("_MDAnalysis_index", ix) atom.SetProp("dummy", f"foo_{ix}") old[ix] = {"_MDAnalysis_index": ix, "dummy": f"foo_{ix}"} - _infer_bo_and_charges(mol) - newmol = _standardize_patterns(mol) + newmol = self.assign_bond_orders_and_charges(mol) new = {} for a in newmol.GetAtoms(): ix = a.GetIntProp("_MDAnalysis_index") @@ -537,102 +563,78 @@ def test_transfer_properties(self, smi): assert "react_atom_idx" not in props assert new == old - @pytest.mark.parametrize("input_type, input_str", [ - ("smi", "c1ccc(cc1)-c1ccccc1-c1ccccc1"), - ("smi", "c1cc[nH]c1"), - ("smi", "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1"), - ("smi", "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21"), - ("smi", "c1csc(c1)-c1ccoc1-c1cc[nH]c1"), - ("smi", "C1=C2C(=NC=N1)N=CN2"), - ("smi", "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]"), - ("smi", "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1"), - ("smi", "C=CC=CC=CC=CC=CC=C"), - ("smi", "NCCCCC([NH3+])C(=O)[O-]"), - ("smi", "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C"), - ("smi", "C#CC=C"), + @pytest.mark.parametrize("smi", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", # HID HIE HIP residues, see PR #2941 - ("smi", "O=C([C@H](CC1=CNC=N1)N)O"), - ("smi", "O=C([C@H](CC1=CN=CN1)N)O"), - ("smi", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]"), + "O=C([C@H](CC1=CNC=N1)N)O", + "O=C([C@H](CC1=CN=CN1)N)O", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", # fixes from PR #3044 - ("smi", "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1"), - ("smi", "[O-][n+]1cccnc1"), - ("smi", "C[n+]1ccccc1"), - ("smi", "[PH4+]"), - ("smi", "c1nc[nH]n1"), - ("smi", "CC(=O)C=C(C)N"), - ("smi", "CC(C)=CC=C[O-]"), - ("smi", "O=S(C)(C)=NC"), - ("smi", "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1"), - ("smi", "CCCC/C=C/C#CC#CCCCCCCCC(=O)O"), - ("smi", "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1"), - ("smi", "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1"), - ("smi", "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N"), - ("smi", "[O-][n+]1onc2ccccc21"), - ("smi", "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1"), - ("smi", "[O-]c1ccccc1"), - ("smi", "[O-]C=CC=CCC=CC=[N+](C)C"), - # test amino acids - ("aa", "A"), - ("aa", "G"), - ("aa", "I"), - ("aa", "L"), - ("aa", "P"), - ("aa", "V"), - ("aa", "F"), - ("aa", "W"), - ("aa", "Y"), - ("aa", "D"), - ("aa", "E"), - ("aa", "R"), - ("aa", "H"), - ("aa", "K"), - ("aa", "S"), - ("aa", "T"), - ("aa", "C"), - ("aa", "M"), - ("aa", "N"), - ("aa", "Q"), - # test nucleic acids - ("na", "A"), - ("na", "T"), - ("na", "U"), - ("na", "C"), - ("na", "G"), + "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", + "[O-][n+]1cccnc1", + "C[n+]1ccccc1", + "[PH4+]", + "c1nc[nH]n1", + "CC(=O)C=C(C)N", + "CC(C)=CC=C[O-]", + "O=S(C)(C)=NC", + "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", + "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", + "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", + "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", + "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", + "[O-][n+]1onc2ccccc21", + "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", + "[O-]c1ccccc1", + "[O-]C=CC=CCC=CC=[N+](C)C", + # amino acids + "C[C@H](N)C(=O)O", # A + "NCC(=O)O", # G + "CC[C@H](C)[C@H](N)C(=O)O", # I + "CC(C)C[C@H](N)C(=O)O", # L + "O=C(O)[C@@H]1CCCN1", # P + "CC(C)[C@H](N)C(=O)O", # V + "N[C@@H](Cc1ccccc1)C(=O)O", # F + "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W + "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y + "N[C@@H](CC(=O)O)C(=O)O", # D + "N[C@@H](CCC(=O)O)C(=O)O", # E + "N=C(N)NCCC[C@H](N)C(=O)O", # R + "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H + "NCCCC[C@H](N)C(=O)O", # K + "N[C@@H](CO)C(=O)O", # S + "C[C@@H](O)[C@H](N)C(=O)O", # T + "N[C@@H](CS)C(=O)O", # C + "CSCC[C@H](N)C(=O)O", # M + "NC(=O)C[C@H](N)C(=O)O", # N + "NC(=O)CC[C@H](N)C(=O)O", # Q + # nucleic acids + "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A + "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T + "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U + "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C + "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G ]) - def test_order_independant(self, input_type, input_str): + def test_order_independant(self, smi): # generate mol with hydrogens but without bond orders - if input_type == "smi": - ref = Chem.MolFromSmiles(input_str) - elif input_type == "na": - ref = Chem.MolFromSequence(input_str, flavor=9) - else: - ref = Chem.MolFromSequence(input_str) - template = self.mol_to_template(ref) - # go through each possible starting atom - for a in template.GetAtoms(): - smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) - m = Chem.MolFromSmiles(smi, sanitize=False) - for atom in m.GetAtoms(): - atom.SetFormalCharge(0) - atom.SetIsAromatic(False) - atom.SetNoImplicit(True) - m.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(m) - m = _standardize_patterns(m) - Chem.SanitizeMol(m) + ref = Chem.MolFromSmiles(smi) + stripped_mol = self.add_Hs_remove_bo_and_charges(ref) + # enumerate mols with different starting atoms + for m in self.enumerate_reordered_mol(stripped_mol): + m = self.assign_bond_orders_and_charges(m) m = Chem.RemoveHs(m) - match = m.HasSubstructMatch(ref) and ref.HasSubstructMatch(m) - if not match: - # try resonance structures for charged conjugated systems - for mol in Chem.ResonanceMolSupplier(m, maxStructs=20): - match = (mol.HasSubstructMatch(ref) and - ref.HasSubstructMatch(mol)) - if match: - break - assert match, (f"(input) {Chem.MolToSmiles(ref)} != " - f"{Chem.MolToSmiles(m)} (output) " - f"root atom {a.GetIdx()}") + self.assert_mols_equal_or_resonance_structure(m, ref) def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" @@ -651,11 +653,9 @@ def test_warn_conjugated_max_iter(self): ]) def test_ions(self, smi): ref = Chem.MolFromSmiles(smi) - mol = self.mol_to_template(ref) - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol) - Chem.SanitizeMol(mol) - assert mol.HasSubstructMatch(ref) and ref.HasSubstructMatch(mol) + stripped_mol = self.add_Hs_remove_bo_and_charges(ref) + mol = self.assign_bond_orders_and_charges(stripped_mol) + assert is_isomorphic(mol, ref) @pytest.mark.parametrize("smi", [ "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", From b18150f5720569fb0a4ffc8307ed14d78f8728ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 16 Mar 2022 22:00:35 +0100 Subject: [PATCH 42/47] improve tests readability --- .../MDAnalysisTests/converters/test_rdkit.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index a49f5767f08..a039e021bba 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -192,17 +192,19 @@ def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): ]) def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) + mda_atom = sel.atoms[atom_index] umol = sel.convert_to.rdkit(NoImplicit=False) atom = umol.GetAtomWithIdx(atom_index) mi = atom.GetMonomerInfo() - - for mda_attr, rd_attr in RDATTRIBUTES.items(): - rd_value = getattr(mi, "Get%s" % rd_attr)() - if hasattr(sel, mda_attr): - mda_value = getattr(sel, mda_attr)[atom_index] - if mda_attr == "names": - rd_value = rd_value.strip() - assert rd_value == mda_value + assert mda_atom.altLoc == mi.GetAltLoc() + assert mda_atom.chainID == mi.GetChainId() + assert mda_atom.icode == mi.GetInsertionCode() + assert mda_atom.name == mi.GetName().strip() + assert mda_atom.occupancy == mi.GetOccupancy() + assert mda_atom.resname == mi.GetResidueName() + assert mda_atom.resid == mi.GetResidueNumber() + assert mda_atom.segindex == mi.GetSegmentNumber() + assert mda_atom.tempfactor == mi.GetTempFactor() @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], indirect=True) @@ -283,11 +285,10 @@ def test_add_mda_attr_to_rdkit(self, attr, value, expected): def test_other_attributes(self, mol2, idx): mol = mol2.atoms.convert_to("RDKIT") rdatom = mol.GetAtomWithIdx(idx) - rdprops = rdatom.GetPropsAsDict() - for prop in ["charge", "segid", "type"]: - rdprop = rdprops["_MDAnalysis_%s" % prop] - mdaprop = getattr(mol2.atoms[idx], prop) - assert rdprop == mdaprop + mda_atom = mol2.atoms[idx] + assert mda_atom.charge == rdatom.GetDoubleProp("_MDAnalysis_charge") + assert mda_atom.segid == rdatom.GetProp("_MDAnalysis_segid") + assert mda_atom.type == rdatom.GetProp("_MDAnalysis_type") @pytest.mark.parametrize("sel_str", [ "resname ALA", From e01fc07e704449688f357250684ce8f07b4cfb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 17 Mar 2022 20:59:41 +0100 Subject: [PATCH 43/47] fix resonance struct test --- .../MDAnalysisTests/converters/test_rdkit.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index a039e021bba..b799c8ffb83 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -430,15 +430,15 @@ def assign_bond_orders_and_charges(self, mol): Chem.SanitizeMol(mol) return mol - def assert_mols_equal_or_resonance_structure(self, mol, ref): - """Checks if 2 molecules are equal. If not, enumerates the resonance - structures of the first molecule and checks again""" - isomorphic = is_isomorphic(mol, ref) - if not is_isomorphic: - # try resonance structures - isomorphic = any(is_isomorphic(res_mol, ref) - for res_mol in Chem.ResonanceMolSupplier(mol)) - assert isomorphic + def assert_isomorphic_resonance_structure(self, mol, ref): + """Checks if 2 molecules are isomorphic using their resonance + structures + """ + isomorphic = mol.HasSubstructMatch(ref) + if not isomorphic: + isomorphic = bool(Chem.ResonanceMolSupplier(mol) + .GetSubstructMatch(ref)) + assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" @pytest.mark.parametrize("smi, out", [ ("C(-[H])(-[H])(-[H])-[H]", "C"), @@ -635,7 +635,8 @@ def test_order_independant(self, smi): for m in self.enumerate_reordered_mol(stripped_mol): m = self.assign_bond_orders_and_charges(m) m = Chem.RemoveHs(m) - self.assert_mols_equal_or_resonance_structure(m, ref) + self.assert_isomorphic_resonance_structure(m, ref) + def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" From 4e4a40bc70a0c3b1175926a63dc06e9fe5a6ef3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 17 Mar 2022 20:59:55 +0100 Subject: [PATCH 44/47] add xfail test --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index b799c8ffb83..3c57c4dce6d 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -599,6 +599,8 @@ def test_transfer_properties(self, smi): "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", "[O-]c1ccccc1", "[O-]C=CC=CCC=CC=[N+](C)C", + "C=[N+](-[O-])-C", + "C-[N-]-C(=O)C", # amino acids "C[C@H](N)C(=O)O", # A "NCC(=O)O", # G @@ -637,6 +639,15 @@ def test_order_independant(self, smi): m = Chem.RemoveHs(m) self.assert_isomorphic_resonance_structure(m, ref) + @pytest.mark.xfail(reason="Not currently tackled by the RDKitConverter") + @pytest.mark.parametrize("smi", [ + "C-[N+]#N", + "C-N=[N+]=[N-]", + "C-[O+]=C", + "C-[N+]#[C-]", + ]) + def test_order_independant_issue_3339(self, smi): + self.test_order_independant(smi) def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" From 5a48bd693ecef9d1400b13f4feb5494e7afab179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 30 Mar 2022 14:52:58 +0200 Subject: [PATCH 45/47] lint --- testsuite/MDAnalysisTests/converters/test_rdkit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 3c57c4dce6d..24a6066b4eb 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -411,7 +411,7 @@ def add_Hs_remove_bo_and_charges(self, mol): bond.SetBondType(Chem.BondType.SINGLE) mH.UpdatePropertyCache(strict=False) return mH - + def enumerate_reordered_mol(self, mol): """Enumerates all possible starting atoms for a given molecule""" # go through each possible starting atom @@ -422,7 +422,7 @@ def enumerate_reordered_mol(self, mol): atom.SetNoImplicit(True) reordered_mol.UpdatePropertyCache(strict=False) yield reordered_mol - + def assign_bond_orders_and_charges(self, mol): """Returns a sanitized molecule with infered bond orders and charges""" _infer_bo_and_charges(mol) @@ -692,7 +692,7 @@ def test_pdb_names(self): mol = u.atoms.convert_to.rdkit() names = u.atoms.names rd_names = np.array([a.GetProp("_MDAnalysis_name") - for a in mol.GetAtoms()]) + for a in mol.GetAtoms()]) assert (names == rd_names).all() @pytest.mark.parametrize("smi", [ From 430f3e6a78442cc6555945f0ce846048db315f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 30 Mar 2022 14:53:48 +0200 Subject: [PATCH 46/47] docs update from code review --- package/MDAnalysis/converters/RDKit.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 5bf73a8bd8e..ed209362c94 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -50,10 +50,11 @@ .. warning:: The RDKit converter is currently *experimental* and may not work as expected for all molecules. Currently the converter accurately - infers the structures of approximately 90% of the `ChEMBL27`_ dataset. + infers the structures of approximately 99% of the `ChEMBL27`_ dataset. Work is currently ongoing on further improving this and updates to the converter are expected in future releases of MDAnalysis. - Please see `Pull Request #3044`_ and `Issue #3339`_ for more details. + Please see `Issue #3339`_ and the `RDKitConverter benchmark`_ for more + details. @@ -76,9 +77,8 @@ .. Links .. _`ChEMBL27`: https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/ -.. _`Pull Request #3044`: https://github.com/MDAnalysis/mdanalysis/pull/3044 .. _`Issue #3339`: https://github.com/MDAnalysis/mdanalysis/issues/3339 - +.. _`RDKitConverter benchmark`: https://github.com/MDAnalysis/RDKitConverter-benchmark """ import warnings @@ -285,13 +285,11 @@ class RDKitConverter(base.ConverterBase): .. versionadded:: 2.0.0 .. versionchanged:: 2.2.0 - See the full changelog for more details: - - - Improved the accuracy of the converter - - Atoms in the resulting molecule follow the same order as in the - AtomGroup - - Fixed a ``SanitizationError`` when disabling the bond order inferring - step + Improved the accuracy of the converter. Atoms in the resulting molecule + now follow the same order as in the AtomGroup. The output of + `atom.GetMonomerInfo().GetName()` now follows the guidelines for PDB + files while the original name is still available through + `atom.GetProp("_MDAnalysis_name")` """ @@ -305,8 +303,7 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, Parameters ----------- - obj : :class:`~MDAnalysis.core.groups.AtomGroup` or - :class:`~MDAnalysis.core.universe.Universe` + obj : `AtomGroup` or `Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the From f019d28350ef8adb9ba778b20dc1200e3fc4b426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 30 Mar 2022 23:49:31 +0200 Subject: [PATCH 47/47] fix docs --- package/MDAnalysis/converters/RDKit.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index ed209362c94..782cd82f238 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -277,7 +277,7 @@ class RDKitConverter(base.ConverterBase): * `Video (4:55 to 8:05) `__ * `Slides `__ - Their are some molecules containing specific patterns that the converter + There are some molecules containing specific patterns that the converter cannot currently tackle correctly. See `Issue #3339 `__ for more info. @@ -287,9 +287,9 @@ class RDKitConverter(base.ConverterBase): .. versionchanged:: 2.2.0 Improved the accuracy of the converter. Atoms in the resulting molecule now follow the same order as in the AtomGroup. The output of - `atom.GetMonomerInfo().GetName()` now follows the guidelines for PDB + ``atom.GetMonomerInfo().GetName()`` now follows the guidelines for PDB files while the original name is still available through - `atom.GetProp("_MDAnalysis_name")` + ``atom.GetProp("_MDAnalysis_name")`` """ @@ -303,7 +303,7 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, Parameters ----------- - obj : `AtomGroup` or `Universe` + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the