diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml index e5a0794450c..a5163a3170a 100644 --- a/.github/actions/setup-deps/action.yaml +++ b/.github/actions/setup-deps/action.yaml @@ -79,7 +79,7 @@ inputs: pytng: default: 'pytng>=0.2.3' rdkit: - default: 'rdkit>=2020.03.1' + default: 'rdkit>=2022.09.1' scikit-learn: default: 'scikit-learn' seaborn: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 41424cf50de..a41aa2dae70 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -113,7 +113,7 @@ jobs: parmed pytng>=0.2.3 tidynamics>=1.0.0 - rdkit>=2020.03.1 + rdkit>=2022.09.1 displayName: 'Install additional dependencies for 64-bit tests' condition: and(succeeded(), eq(variables['PYTHON_ARCH'], 'x64')) - script: >- diff --git a/package/CHANGELOG b/package/CHANGELOG index 51ed2b16c4e..90b144f5477 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -17,8 +17,8 @@ The rules for this file: ??/??/?? IAlibay, HeetVekariya, marinegor, lilyminium, RMeli, ljwoods2, aditya292002, pstaerk, PicoCentauri, BFedder, tyler.je.reddy, SampurnaM, leonwehrhan, kainszs, orionarcher, - yuxuanzhuang - + yuxuanzhuang, cbouy + * 2.8.0 Fixes @@ -45,6 +45,8 @@ Fixes * Fix groups.py doctests using sphinx directives (Issue #3925, PR #4374) Enhancements + * Added `TemplateInferer` and `RDKitInferer` dataclasses to the + `RDKitInferring` module to be used by the RDKit converter. (PR #4305) * Added a tqdm progress bar for `MDAnalysis.analysis.pca.PCA.transform()` (PR #4531) * Improved performance of PDBWriter (Issue #2785, PR #4472) @@ -55,6 +57,9 @@ Enhancements DOI 10.1021/acs.jpcb.7b11988. (Issue #2039, PR #4524) Changes + * Refactored the RDKit converter code to move the inferring code in a separate + `RDKitInferring` module. The bond order and charges inferer has been move to + an `MDAnalysisInferer` dataclass in there. (PR #4305) * As per SPEC0 the minimum supported Python version has been raised to 3.10 (PR #4502) * MDAnalysis.analysis.hole2 is now directly imported from the mdakit @@ -72,6 +77,8 @@ Changes numpy.testing.assert_allclose #4438) Deprecations + * The RDKit converter parameter `NoImplicit` has been deprecated in favour of + `implicit_hydrogens` and `inferer` parameters. (PR #4305) * The MDAnalysis.analysis.waterdynamics module has been deprecated in favour of the waterdynamics MDAKit and will be removed in version 3.0.0 (PR #4404) * The MDAnalysis.analysis.psa module has been deprecated in favour of diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 139528440ab..bffdbe3a568 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -24,7 +24,8 @@ """RDKit molecule I/O --- :mod:`MDAnalysis.converters.RDKit` ================================================================ -Read coordinates data from an `RDKit `__ :class:`rdkit.Chem.rdchem.Mol` with +Read coordinates data from an `RDKit `__ +:class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` into an MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. @@ -67,12 +68,6 @@ .. autoclass:: RDKitConverter :members: -.. autofunction:: _infer_bo_and_charges - -.. autofunction:: _standardize_patterns - -.. autofunction:: _rebuild_conjugated_bonds - .. Links @@ -83,32 +78,24 @@ import copy import warnings +from contextlib import suppress from functools import lru_cache from io import StringIO import numpy as np -from . import base from ..coordinates import memory from ..coordinates.PDB import PDBWriter from ..core.topologyattrs import _TOPOLOGY_ATTRS from ..exceptions import NoDataError +from . import base -try: +DEFAULT_INFERER = None +with suppress(ImportError): from rdkit import Chem - from rdkit.Chem import AllChem -except ImportError: - pass -else: - RDBONDORDER = { - 1: Chem.BondType.SINGLE, - 1.5: Chem.BondType.AROMATIC, - "ar": Chem.BondType.AROMATIC, - 2: Chem.BondType.DOUBLE, - 3: Chem.BondType.TRIPLE, - } - # add string version of the key for each bond - RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) + + from .RDKitInferring import RDBONDORDER, MDAnalysisInferer + RDATTRIBUTES = { "altLocs": "AltLoc", "chainIDs": "ChainId", @@ -120,31 +107,11 @@ "segindices": "SegmentNumber", "tempfactors": "TempFactor", } - PERIODIC_TABLE = Chem.GetPeriodicTable() - - -# charges that should be assigned to monatomic cations -# structure --> atomic number : formal charge -# anion charges are directly handled by the code using the typical valence -# of the atom -MONATOMIC_CATION_CHARGES = { - 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, - 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, - 26: 2, # Fe could also be +3 - 13: 3, -} -# reactions uses by _standardize_patterns to fix challenging cases -# must have single reactant and product, and cannot add any atom -STANDARDIZATION_REACTIONS = [ - "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm - "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm - "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate - "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG - "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" - ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP - "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone - "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N -] + DEFAULT_INFERER = MDAnalysisInferer() + # only here for backwards compatibility + _infer_bo_and_charges = DEFAULT_INFERER._infer_bo_and_charges + _standardize_patterns = DEFAULT_INFERER._standardize_patterns + _deduce_PDB_atom_name = PDBWriter(StringIO())._deduce_PDB_atom_name @@ -153,10 +120,11 @@ class RDKitReader(memory.MemoryReader): .. versionadded:: 2.0.0 """ - format = 'RDKIT' + + format = "RDKIT" # Structure.coordinates always in Angstrom - units = {'time': None, 'length': 'Angstrom'} + units = {"time": None, "length": "Angstrom"} @staticmethod def _format_hint(thing): @@ -180,14 +148,15 @@ def __init__(self, filename, **kwargs): RDKit molecule """ n_atoms = filename.GetNumAtoms() - coordinates = np.array([ - conf.GetPositions() for conf in filename.GetConformers()], - dtype=np.float32) + coordinates = np.array( + [conf.GetPositions() for conf in filename.GetConformers()], + dtype=np.float32, + ) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") coordinates = np.empty((1, n_atoms, 3), dtype=np.float32) coordinates[:] = np.nan - super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) + super(RDKitReader, self).__init__(coordinates, order="fac", **kwargs) class RDKitConverter(base.ConverterBase): @@ -275,8 +244,21 @@ class RDKitConverter(base.ConverterBase): guessed if not present. Hydrogens should be explicit in the topology file. If this is not the case, - use the parameter ``NoImplicit=False`` when using the converter to allow - implicit hydrogens and disable inferring bond orders and charges. + use the parameter ``implicit_hydrogens=True`` when using the converter to + allow implicit hydrogens, and ``inferer=None`` to disable inferring bond + orders and charges. You can also pass your own callable function to + ``inferer`` to assign bond orders and charges as you see fit:: + + >>> from rdkit import Chem + >>> from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate + >>> template = Chem.MolFromSmiles("NC(Cc1ccccc1)C(=O)O") + >>> def infer_from_template(mol: Chem.Mol) -> Chem.Mol: + ... return AssignBondOrdersFromTemplate(template, mol) + >>> mol = u.atoms.convert_to.rdkit(inferer=infer_from_template) + + + Note that all builtin inferer functions can be found in the + :mod:`RDKitInferring` module. Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every @@ -285,8 +267,8 @@ class RDKitConverter(base.ConverterBase): sensitive to the arguments that were passed to the converter. The number of objects cached can be changed with the function :func:`set_converter_cache_size`. However, ``ag.convert_to("RDKIT")`` - followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the - cache since the arguments given are different. You can pass a + followed by ``ag.convert_to("RDKIT", implicit_hydrogens=False)`` will not + use the cache since the arguments given are different. You can pass a ``cache=False`` argument to the converter to bypass the caching system. The ``_MDAnalysis_index`` property of the resulting molecule corresponds @@ -313,61 +295,104 @@ class RDKitConverter(base.ConverterBase): files while the original name is still available through ``atom.GetProp("_MDAnalysis_name")`` - """ + .. versionchanged:: 2.8.0 + Deprecated ``max_iter`` (moved to the inferer class + :class:`~MDAnalysis.converters.RDKitInferring.MDAnalysisInferer`) and + deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added + ``inferer`` to specify a callable that can transform the molecule (this + operation is cached). - lib = 'RDKIT' - units = {'time': None, 'length': 'Angstrom'} + """ - def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, - force=False): + lib = "RDKIT" + units = {"time": None, "length": "Angstrom"} + + def convert( + self, + obj, + cache=True, + implicit_hydrogens=False, + force=False, + inferer=DEFAULT_INFERER, + **kwargs, + ): """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. Parameters ----------- - obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or + :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the same AtomGroup selection and with the same arguments passed to the converter - NoImplicit : bool - Prevent adding hydrogens to the molecule - max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` + inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] + A callable to infer bond orders and charges for the RDKit molecule + created by the converter. If ``None``, inferring is skipped. + implicit_hydrogens : bool + Whether to allow implicit hydrogens on the molecule or not. force : bool Force the conversion when no hydrogens were detected but - ``NoImplicit=True``. Useful for inorganic molecules mostly. + ``inferer`` is not ``None``. Useful for inorganic molecules mostly. """ try: from rdkit import Chem except ImportError: - raise ImportError("RDKit is required for the RDKitConverter but " - "it's not installed. Try installing it with \n" - "conda install -c conda-forge rdkit") + raise ImportError( + "RDKit is required for the RDKitConverter but " + "it's not installed. Try installing it with \n" + "`conda install -c conda-forge rdkit` or `pip install rdkit`" + ) from None try: # make sure to use atoms (Issue 46) ag = obj.atoms except AttributeError: - raise TypeError("No `atoms` attribute in object of type {}, " - "please use a valid AtomGroup or Universe".format( - type(obj))) from None + raise TypeError( + f"No `atoms` attribute in object of type {type(obj)!r}, " + "please use a valid AtomGroup or Universe" + ) from None + + if (max_iter := kwargs.get("max_iter")) is not None: + warnings.warn( + "Using `max_iter` is deprecated, use `MDAnalysisInferer" + "(max_iter=...)` instead", + DeprecationWarning, + ) + if isinstance(inferer, MDAnalysisInferer): + inferer = MDAnalysisInferer(max_iter=max_iter) + + if (NoImplicit := kwargs.get("NoImplicit")) is not None: + warnings.warn( + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` " + " instead. To disable bond order and formal charge inferring, " + "use `inferer=None`", + DeprecationWarning, + ) + implicit_hydrogens = not NoImplicit + # backwards compatibility + if implicit_hydrogens: + inferer = None # parameters passed to atomgroup_to_mol - kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter, force=force) + params = dict( + implicit_hydrogens=implicit_hydrogens, force=force, inferer=inferer + ) if cache: - mol = atomgroup_to_mol(ag, **kwargs) + mol = atomgroup_to_mol(ag, **params) mol = copy.deepcopy(mol) else: - mol = atomgroup_to_mol.__wrapped__(ag, **kwargs) + mol = atomgroup_to_mol.__wrapped__(ag, **params) # add a conformer for the current Timestep if hasattr(ag, "positions"): if np.isnan(ag.positions).any(): - warnings.warn("NaN detected in coordinates, the output " - "molecule will not have 3D coordinates assigned") + warnings.warn( + "NaN detected in coordinates, the output " + "molecule will not have 3D coordinates assigned" + ) else: # assign coordinates conf = Chem.Conformer(mol.GetNumAtoms()) @@ -384,22 +409,33 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, @lru_cache(maxsize=2) -def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): +def atomgroup_to_mol( + ag, + implicit_hydrogens=False, + force=False, + inferer=DEFAULT_INFERER, + **kwargs, +): """Converts an AtomGroup to an RDKit molecule without coordinates. Parameters ----------- ag : MDAnalysis.core.groups.AtomGroup The AtomGroup to convert - NoImplicit : bool - Prevent adding hydrogens to the molecule and allow bond orders and - formal charges to be guessed from the valence of each atom. - max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` + implicit_hydrogens : bool + Whether to allow implicit hydrogens on the molecule or not. force : bool - Force the conversion when no hydrogens were detected but - ``NoImplicit=True``. Mostly useful for inorganic molecules. + Force the conversion when no hydrogens were detected but ``inferer`` + is not ``None``. Useful for inorganic molecules mostly. + inferer : Optional[Callable[[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol]] + A callable to infer bond orders and charges for the RDKit molecule + created by the converter. If ``None``, inferring is skipped. + + + .. versionchanged:: 2.8.0 + Deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added + ``inferer`` to specify a callable that can transform the molecule (this + operation is cached). """ try: elements = ag.elements @@ -408,7 +444,8 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): "The `elements` attribute is required for the RDKitConverter " "but is not present in this AtomGroup. Please refer to the " "documentation to guess elements from other attributes or " - "type `help(MDAnalysis.topology.guessers)`") from None + "type `help(MDAnalysis.topology.guessers)`" + ) from None if "H" not in ag.elements: if force: @@ -416,25 +453,42 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): "No hydrogen atom found in the topology. " "Forcing to continue the conversion." ) - elif NoImplicit: + elif inferer is not None: raise AttributeError( "No hydrogen atom could be found in the topology, but the " "converter requires all hydrogens to be explicit. You can use " - "the parameter ``NoImplicit=False`` when using the converter " - "to allow implicit hydrogens and disable inferring bond " - "orders and charges. You can also use ``force=True`` to " - "ignore this error.") + "the parameter ``inferer=None`` when using the converter " + "to disable inferring bond orders and charges. You can also " + "use ``force=True`` to ignore this error." + ) + + if (NoImplicit := kwargs.pop("NoImplicit", None)) is not None: + warnings.warn( + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` " + "instead. To disable bond order and formal charge inferring, use " + "`inferer=None`", + DeprecationWarning, + ) + implicit_hydrogens = not NoImplicit + # backwards compatibility + if implicit_hydrogens: + inferer = None + if kwargs: + raise ValueError(f"Found unexpected arguments: {kwargs}.") # attributes accepted in PDBResidueInfo object pdb_attrs = {} - for attr in RDATTRIBUTES.keys(): + for attr in RDATTRIBUTES: if hasattr(ag, attr): pdb_attrs[attr] = getattr(ag, attr) resnames = pdb_attrs.get("resnames", None) if resnames is None: + def get_resname(idx): return "" + else: + def get_resname(idx): return resnames[idx] @@ -444,6 +498,7 @@ def get_resname(idx): other_attrs[attr] = getattr(ag, attr) mol = Chem.RWMol() + NoImplicit = not implicit_hydrogens # map index in universe to index in mol atom_mapper = {} @@ -458,10 +513,9 @@ def get_resname(idx): _add_mda_attr_to_rdkit(attr, values[i], mi, get_resname(i)) rdatom.SetMonomerInfo(mi) # other properties - for attr in other_attrs.keys(): - value = other_attrs[attr][i] - attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular - _set_atom_property(rdatom, attr, value) + for attr, values in other_attrs.items(): + attr = f"_MDAnalysis_{_TOPOLOGY_ATTRS[attr].singular}" + _set_atom_property(rdatom, attr, values[i]) _set_atom_property(rdatom, "_MDAnalysis_index", i) # add atom index = mol.AddAtom(rdatom) @@ -472,7 +526,8 @@ def get_resname(idx): except NoDataError: warnings.warn( "No `bonds` attribute in this AtomGroup. Guessing bonds based " - "on atoms coordinates") + "on atoms coordinates" + ) ag.guess_bonds() for bond in ag.bonds: @@ -485,21 +540,9 @@ def get_resname(idx): mol.UpdatePropertyCache(strict=False) - if NoImplicit: - # infer bond orders and formal charges from the connectivity - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol, max_iter) - # reorder atoms to match MDAnalysis, since the reactions from - # _standardize_patterns will mess up the original order - order = np.argsort([atom.GetIntProp("_MDAnalysis_index") - for atom in mol.GetAtoms()]) - mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) - - # sanitize if possible - err = Chem.SanitizeMol(mol, catchErrors=True) - if err: - warnings.warn("Could not sanitize molecule: " - f"failed during step {err!r}") + if inferer is not None: + # infer bond orders and formal charges + mol = inferer(mol) return mol @@ -514,7 +557,7 @@ def set_converter_cache_size(maxsize): conversions in memory. Using ``maxsize=None`` will remove all limits to the cache size, i.e. everything is cached. """ - global atomgroup_to_mol # pylint: disable=global-statement + global atomgroup_to_mol # pylint: disable=global-statement atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__) @@ -542,7 +585,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi, resname=""): # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] - getattr(mi, "Set%s" % rdattr)(value) + getattr(mi, f"Set{rdattr}")(value) def _set_str_prop(atom, attr, value): @@ -589,391 +632,3 @@ def _ignore_prop(atom, attr, value): def _set_atom_property(atom, attr, value): """Saves any attribute and value into an RDKit atom property""" _atom_property_dispatcher.get(type(value), _ignore_prop)(atom, attr, value) - - -def _atom_sorter(atom): - """Sorts atoms in the molecule in a way that makes it easy for the bond - order and charge infering code to get the correct state on the first - try. Currently sorts by number of unpaired electrons, then by number of - heavy atom neighbors (i.e. atoms at the edge first).""" - num_heavy_neighbors = len([ - neighbor for neighbor in atom.GetNeighbors() - if neighbor.GetAtomicNum() > 1] - ) - return (-_get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) - - -def _infer_bo_and_charges(mol): - """Infer bond orders and formal charges from a molecule. - - Since most MD topology files don't explicitly retain information on bond - orders or charges, it has to be guessed from the topology. This is done by - looping over each atom and comparing its expected valence to the current - valence to get the Number of Unpaired Electrons (NUE). - If an atom has a negative NUE, it needs a positive formal charge (-NUE). - If two neighbouring atoms have UEs, the bond between them most - likely has to be increased by the value of the smallest NUE. - If after this process, an atom still has UEs, it needs a negative formal - charge of -NUE. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule is modified inplace and must have all hydrogens added - - Notes - ----- - This algorithm is order dependant. For example, for a carboxylate group - R-C(-O)-O the first oxygen read will receive a double bond and the other - one will be charged. It will also affect more complex conjugated systems. - """ - # heavy atoms sorted by number of heavy atom neighbors (lower first) then - # NUE (higher first) - atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], - key=_atom_sorter) - - for atom in atoms: - # monatomic ions - if atom.GetDegree() == 0: - atom.SetFormalCharge(MONATOMIC_CATION_CHARGES.get( - atom.GetAtomicNum(), - -_get_nb_unpaired_electrons(atom)[0])) - mol.UpdatePropertyCache(strict=False) - continue - # get NUE for each possible valence - nue = _get_nb_unpaired_electrons(atom) - # if there's only one possible valence state and the corresponding - # NUE is negative, it means we can only add a positive charge to - # the atom - if (len(nue) == 1) and (nue[0] < 0): - atom.SetFormalCharge(-nue[0]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if above case or atom has no unpaired electron - if (len(nue) == 1) and (nue[0] <= 0): - continue - else: - neighbors = sorted(atom.GetNeighbors(), reverse=True, - key=lambda a: _get_nb_unpaired_electrons(a)[0]) - # check if one of the neighbors has a common NUE - for na in neighbors: - # get NUE for the neighbor - na_nue = _get_nb_unpaired_electrons(na) - # smallest common NUE - common_nue = min( - min([i for i in nue if i >= 0], default=0), - min([i for i in na_nue if i >= 0], default=0) - ) - # a common NUE of 0 means we don't need to do anything - if common_nue != 0: - # increase bond order - bond = mol.GetBondBetweenAtoms( - atom.GetIdx(), na.GetIdx()) - order = common_nue + 1 - bond.SetBondType(RDBONDORDER[order]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if one of the valences is complete - nue = _get_nb_unpaired_electrons(atom) - if any([n == 0 for n in nue]): - break - - # if atom valence is still not filled - nue = _get_nb_unpaired_electrons(atom) - if not any([n == 0 for n in nue]): - # transform nue to charge - atom.SetFormalCharge(-nue[0]) - atom.SetNumRadicalElectrons(0) - mol.UpdatePropertyCache(strict=False) - - -def _get_nb_unpaired_electrons(atom): - """Calculate the number of unpaired electrons (NUE) of an atom - - Parameters - ---------- - atom: rdkit.Chem.rdchem.Atom - The atom for which the NUE will be computed - - Returns - ------- - nue : list - The NUE for each possible valence of the atom - """ - expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) - current_v = atom.GetTotalValence() - atom.GetFormalCharge() - return [v - current_v for v in expected_vs] - - -def _standardize_patterns(mol, max_iter=200): - """Standardizes functional groups - - Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, - and SMARTS reactions for other functional groups. - Due to the way reactions work, we first have to split the molecule by - fragments. Then, for each fragment, we apply the standardization reactions. - Finally, the fragments are recombined. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to standardize - max_iter : int - Maximum number of iterations to standardize conjugated systems - - Returns - ------- - mol : rdkit.Chem.rdchem.Mol - The standardized molecule - - Notes - ----- - The following functional groups are transformed in this order: - - +---------------+------------------------------------------------------------------------------+ - | Name | Reaction | - +===============+==============================================================================+ - | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | - +---------------+------------------------------------------------------------------------------+ - | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | - +---------------+------------------------------------------------------------------------------+ - | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | - +---------------+------------------------------------------------------------------------------+ - | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | - +---------------+------------------------------------------------------------------------------+ - | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| - +---------------+------------------------------------------------------------------------------+ - | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - - """ - - # standardize conjugated systems - _rebuild_conjugated_bonds(mol, max_iter) - - # list of sanitized reactions - reactions = [] - for rxn in STANDARDIZATION_REACTIONS: - reaction = AllChem.ReactionFromSmarts(rxn) - reactions.append(reaction) - - # fragment mol (reactions must have single reactant and product) - fragments = list(Chem.GetMolFrags(mol, asMols=True)) - for reactant in fragments: - _apply_reactions(reactions, reactant) - - # recombine fragments - newmol = fragments.pop(0) - for fragment in fragments: - newmol = Chem.CombineMols(newmol, fragment) - - return newmol - - -def _apply_reactions(reactions, reactant): - """Applies a series of unimolecular reactions to a molecule. The reactions - must not add any atom to the product. The molecule is modified inplace. - - Parameters - ---------- - reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] - Reactions from SMARTS. Each reaction is applied until no more - transformations can be made. - reactant : rdkit.Chem.rdchem.Mol - The molecule to transform inplace - - """ - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) - for reaction in reactions: - while reaction.RunReactantInPlace(reactant): - reactant.UpdatePropertyCache(strict=False) - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) - - -def _rebuild_conjugated_bonds(mol, max_iter=200): - """Rebuild conjugated bonds without negatively charged atoms at the - beginning and end of the conjugated system - - Depending on the order in which atoms are read during the conversion, the - :func:`_infer_bo_and_charges` function might write conjugated systems with - a double bond less and both edges of the system as anions instead of the - usual alternating single and double bonds. This function corrects this - behaviour by using an iterative procedure. - The problematic molecules always follow the same pattern: - ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number - of successive single and double bonds. The goal of the iterative procedure - is to make ``n`` as small as possible by consecutively transforming - ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with - ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to - ``*=*-*=*``. - Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can - control that we don't transform the same triplet of atoms back and forth by - adding their indices to a list. - This function also handles conjugated systems with triple bonds. - The molecule needs to be kekulized first to also cover systems - with aromatic rings. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to transform, modified inplace - max_iter : int - Maximum number of iterations performed by the function - - Notes - ----- - The molecule is modified inplace - """ - mol.UpdatePropertyCache(strict=False) - Chem.Kekulize(mol) - # pattern used to find problematic conjugated bonds - # there's usually an even number of matches for this - pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") - # pattern used to finish fixing a series of conjugated bonds - base_end_pattern = Chem.MolFromSmarts( - "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") - # used when there's an odd number of matches for `pattern` - odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," - "$([S;D4;v4]-[O-])]") - # number of unique matches with the pattern - n_matches = len(set([match[0] - for match in mol.GetSubstructMatches(pattern)])) - # nothing to standardize - if n_matches == 0: - return - # single match (unusual) - elif n_matches == 1: - # as a last resort, the only way to standardize is to find a nitrogen - # that can accept a double bond and a positive charge - # or a carbonyl that will become an enolate - end_pattern = odd_end_pattern - # at least 2 matches - else: - # give priority to base end pattern and then deal with the odd one if - # necessary - end_pattern = base_end_pattern - backtrack = [] - backtrack_cycles = 0 - for _ in range(max_iter): - # check for ending pattern - end_match = mol.GetSubstructMatch(end_pattern) - if end_match: - # index of each atom - anion1, a1, a2, anion2 = end_match - term_atom = mol.GetAtomWithIdx(anion2) - # edge-case 1: C-[O-] or [N+]-[O-] - # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] - # transform the =O to -[O-] - if ( - term_atom.GetAtomicNum() == 6 - and term_atom.GetFormalCharge() == 0 - ) or ( - term_atom.GetAtomicNum() == 7 - and term_atom.GetFormalCharge() == 1 - ): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - bond.GetBondTypeAsDouble() == 2): - bond.SetBondType(Chem.BondType.SINGLE) - neighbor.SetFormalCharge(-1) - break - # edge-case 2: S=O - # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O - # transform -[O-] to =O - elif (term_atom.GetAtomicNum() == 16 and - term_atom.GetFormalCharge() == 0): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - neighbor.GetFormalCharge() == -1 and - bond.GetBondTypeAsDouble() == 1): - bond.SetBondType(Chem.BondType.DOUBLE) - neighbor.SetFormalCharge(0) - break - # not an edge case: - # increment the charge - else: - term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) - # common to all cases: - # [*-]-*=*-[R] --> *=*-*=[R] - # increment the charge and switch single and double bonds - a = mol.GetAtomWithIdx(anion1) - a.SetFormalCharge(a.GetFormalCharge() + 1) - b = mol.GetBondBetweenAtoms(anion1, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) - b = mol.GetBondBetweenAtoms(a2, anion2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - mol.UpdatePropertyCache(strict=False) - continue - - # switch the position of the charge and the double bond - matches = mol.GetSubstructMatches(pattern) - if matches: - # check if we haven't already transformed this triplet - for match in matches: - # store order-independent atom indices - g = set(match) - # already transformed --> try the next one - if g in backtrack: - continue - # add to backtracking and start the switch - else: - anion, a1, a2 = match - backtrack.append(g) - break - # already performed all changes - else: - if backtrack_cycles == 1: - # might be stuck because there were 2 "odd" end patterns - # misqualified as a single base one - end_pattern = odd_end_pattern - elif backtrack_cycles > 1: - # likely stuck changing the same bonds over and over so - # let's stop here - mol.UpdatePropertyCache(strict=False) - return - match = matches[0] - anion, a1, a2 = match - backtrack = [set(match)] - backtrack_cycles += 1 - - # switch charges - a = mol.GetAtomWithIdx(anion) - a.SetFormalCharge(a.GetFormalCharge() + 1) - a = mol.GetAtomWithIdx(a2) - a.SetFormalCharge(a.GetFormalCharge() - 1) - # switch bond orders - b = mol.GetBondBetweenAtoms(anion, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) - mol.UpdatePropertyCache(strict=False) - # update number of matches for the end pattern - n_matches = len(set([match[0] for match in matches])) - if n_matches == 1: - end_pattern = odd_end_pattern - # start new iteration - continue - - # no more changes to apply - mol.UpdatePropertyCache(strict=False) - return - - # reached max_iter - warnings.warn("The standardization could not be completed within a " - "reasonable number of iterations") diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py new file mode 100644 index 00000000000..c009ff60185 --- /dev/null +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -0,0 +1,703 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# +# MDAnalysis --- https://www.mdanalysis.org +# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors +# (see the file AUTHORS for the full list of names) +# +# Released under the GNU Lesser General Public License, v2 or higher +# +# Please cite your use of MDAnalysis in published work: +# +# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, +# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. +# MDAnalysis: A Python package for the rapid analysis of molecular dynamics +# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th +# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. +# doi: 10.25080/majora-629e541a-00e +# +# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. +# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. +# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 +# + +"""RDKit bond order inferring --- :mod:`MDAnalysis.converters.RDKitInferring` +============================================================================= +Bond order and formal charge inferring for RDKit molecules. + +Classes +------- + +.. autoclass:: MDAnalysisInferer + :members: + :private-members: + +.. autoclass:: TemplateInferer + :members: + +.. autoclass:: RDKitInferer + :members: + +.. autofunction:: sanitize_mol + +.. autofunction:: reorder_atoms + +""" +import warnings +from collections import defaultdict +from contextlib import suppress +from dataclasses import dataclass +from typing import ClassVar, Dict, List, Optional, Tuple + +import numpy as np + +with suppress(ImportError): + from rdkit import Chem + from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate + from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts + + RDBONDORDER = { + 1: Chem.BondType.SINGLE, + 1.5: Chem.BondType.AROMATIC, + "ar": Chem.BondType.AROMATIC, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } + # add string version of the key for each bond + RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) + PERIODIC_TABLE = Chem.GetPeriodicTable() + +with suppress(ImportError): + from rdkit.Chem.rdDetermineBonds import ( + DetermineBondOrders, + ) # available since 2022.09.1 + + +def reorder_atoms( + mol: "Chem.Mol", field: str = "_MDAnalysis_index" +) -> "Chem.Mol": + """Reorder atoms based on the given field. Defaults to sorting in the same + order as the input AtomGroup. + + Notes + ----- + If the field is not found on the molecule, skips reordering. + """ + with suppress(KeyError): + order = np.argsort([atom.GetIntProp(field) for atom in mol.GetAtoms()]) + return Chem.RenumberAtoms(mol, order.astype(int).tolist()) + # not a molecule converted by MDAnalysis + warnings.warn( + f"{field!r} not available on the input mol atoms, skipping reordering " + "of atoms." + ) + return mol + + +def sanitize_mol(mol: "Chem.Mol") -> None: + """Tries to sanitize the molecule, and if an error is raised, logs it as a + warning. + """ + err = Chem.SanitizeMol(mol, catchErrors=True) + if err: + warnings.warn( + f"Could not sanitize molecule: failed during step {err!r}" + ) + + +@dataclass(frozen=True) +class MDAnalysisInferer: + """Bond order and formal charge inferring as originally implemented for + the RDKit converter. + + Attributes + ---------- + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :meth:`~MDAnalysisInferer._rebuild_conjugated_bonds` + MONATOMIC_CATION_CHARGES : ClassVar[Dict[int, int]] + Charges that should be assigned to monatomic cations. Maps atomic + number to their formal charge. Anion charges are directly handled by + the code using the typical valence of the atom. + STANDARDIZATION_REACTIONS : ClassVar[List[str]] + Reactions uses by :meth:`~MDAnalysisInferer._standardize_patterns` to + fix challenging cases must have single reactant and product, and + cannot add any atom. + + .. versionadded:: 2.8.0 + """ + + MONATOMIC_CATION_CHARGES: ClassVar[Dict[int, int]] = { + 3: 1, + 11: 1, + 19: 1, + 37: 1, + 47: 1, + 55: 1, + 12: 2, + 20: 2, + 29: 2, + 30: 2, + 38: 2, + 56: 2, + 26: 2, # Fe could also be +3 + 13: 3, + } + STANDARDIZATION_REACTIONS: ClassVar[List[str]] = [ + "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm + "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm + "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate + "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG + "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" + ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP + "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone + "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N + ] + + max_iter: int = 200 + + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + """Infer bond orders and formal charges in the molecule. + + Parameters + ---------- + mol : Chem.Mol + The molecule to infer bond orders and charges for. + + Returns + ------- + A new molecule with proper bond orders and charges. + """ + self._infer_bo_and_charges(mol) + mol = self._standardize_patterns(mol) + mol = reorder_atoms(mol) + sanitize_mol(mol) + return mol + + @classmethod + def _atom_sorter(cls, atom: "Chem.Atom") -> Tuple[int, int]: + """Sorts atoms in the molecule in a way that makes it easy for the bond + order and charge infering code to get the correct state on the first + try. Currently sorts by number of unpaired electrons, then by number of + heavy atom neighbors (i.e. atoms at the edge first).""" + num_heavy_neighbors = len( + [ + neighbor + for neighbor in atom.GetNeighbors() + if neighbor.GetAtomicNum() > 1 + ] + ) + return (-cls._get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) + + @classmethod + def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: + """Infer bond orders and formal charges from a molecule. + + Since most MD topology files don't explicitly retain information on + bond orders or charges, it has to be guessed from the topology. This + is done by looping over each atom and comparing its expected valence + to the current valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge + (-NUE). If two neighbouring atoms have UEs, the bond between them most + likely has to be increased by the value of the smallest NUE. + If after this process, an atom still has UEs, it needs a negative + formal charge of -NUE. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + Notes + ----- + This algorithm is order dependant. For example, for a carboxylate group + ``R-C(-O)-O`` the first oxygen read will receive a double bond and the + other one will be charged. It will also affect more complex conjugated + systems. + """ + # heavy atoms sorted by number of heavy atom neighbors (lower first) + # then NUE (higher first) + atoms = sorted( + [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1], + key=cls._atom_sorter, + ) + + for atom in atoms: + # monatomic ions + if atom.GetDegree() == 0: + atom.SetFormalCharge( + cls.MONATOMIC_CATION_CHARGES.get( + atom.GetAtomicNum(), + -cls._get_nb_unpaired_electrons(atom)[0], + ) + ) + mol.UpdatePropertyCache(strict=False) + continue + # get NUE for each possible valence + nue = cls._get_nb_unpaired_electrons(atom) + # if there's only one possible valence state and the corresponding + # NUE is negative, it means we can only add a positive charge to + # the atom + if (len(nue) == 1) and (nue[0] < 0): + atom.SetFormalCharge(-nue[0]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if above case or atom has no unpaired electron + if (len(nue) == 1) and (nue[0] <= 0): + continue + else: + neighbors = sorted( + atom.GetNeighbors(), + reverse=True, + key=lambda a: cls._get_nb_unpaired_electrons(a)[0], + ) + # check if one of the neighbors has a common NUE + for na in neighbors: + # get NUE for the neighbor + na_nue = cls._get_nb_unpaired_electrons(na) + # smallest common NUE + common_nue = min( + min([i for i in nue if i >= 0], default=0), + min([i for i in na_nue if i >= 0], default=0), + ) + # a common NUE of 0 means we don't need to do anything + if common_nue != 0: + # increase bond order + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx() + ) + order = common_nue + 1 + bond.SetBondType(RDBONDORDER[order]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if one of the valences is complete + nue = cls._get_nb_unpaired_electrons(atom) + if any([n == 0 for n in nue]): + break + + # if atom valence is still not filled + nue = cls._get_nb_unpaired_electrons(atom) + if not any([n == 0 for n in nue]): + # transform nue to charge + atom.SetFormalCharge(-nue[0]) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) + + @staticmethod + def _get_nb_unpaired_electrons(atom: "Chem.Atom") -> List[int]: + """Calculate the number of unpaired electrons (NUE) of an atom + + Parameters + ---------- + atom: rdkit.Chem.rdchem.Atom + The atom for which the NUE will be computed + + Returns + ------- + nue : List[int] + The NUE for each possible valence of the atom + """ + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() - atom.GetFormalCharge() + return [v - current_v for v in expected_vs] + + def _standardize_patterns( + self, mol: "Chem.Mol", max_iter: Optional[int] = None + ) -> "Chem.Mol": + """Standardizes functional groups + + Uses :meth:`~MDAnalysisInferer._rebuild_conjugated_bonds` to + standardize conjugated systems, and SMARTS reactions for other + functional groups. + Due to the way reactions work, we first have to split the molecule by + fragments. Then, for each fragment, we apply the standardization + reactions. Finally, the fragments are recombined. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + max_iter : Optional[int] + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. + Maximum number of iterations to standardize conjugated systems + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule + + Notes + ----- + The following functional groups are transformed in this order: + + +---------------+------------------------------------------------------------------------------+ + | Name | Reaction | + +===============+==============================================================================+ + | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | + +---------------+------------------------------------------------------------------------------+ + | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | + +---------------+------------------------------------------------------------------------------+ + | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | + +---------------+------------------------------------------------------------------------------+ + | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | + +---------------+------------------------------------------------------------------------------+ + | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| + +---------------+------------------------------------------------------------------------------+ + | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + + """ + if max_iter is None: + max_iter = self.max_iter + else: + warnings.warn( + "Specifying `max_iter` is deprecated and will be removed in a " + "future update. Directly create an instance of " + "`MDAnalysisInferer` with `MDAnalysisInferer(max_iter=...)` " + "instead.", + DeprecationWarning, + ) + + # standardize conjugated systems + self._rebuild_conjugated_bonds(mol, max_iter) + + # list of sanitized reactions + reactions = [ + ReactionFromSmarts(rxn) for rxn in self.STANDARDIZATION_REACTIONS + ] + + # fragment mol (reactions must have single reactant and product) + fragments = list(Chem.GetMolFrags(mol, asMols=True)) + for reactant in fragments: + self._apply_reactions(reactions, reactant) + + # recombine fragments + newmol = fragments.pop(0) + for fragment in fragments: + newmol = Chem.CombineMols(newmol, fragment) + + return newmol + + @staticmethod + def _apply_reactions( + reactions: List["ChemicalReaction"], reactant: "Chem.Mol" + ) -> None: + """Applies a series of unimolecular reactions to a molecule. The + reactions must not add any atom to the product. The molecule is + modified inplace. + + Parameters + ---------- + reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] + Reactions from SMARTS. Each reaction is applied until no more + transformations can be made. + reactant : rdkit.Chem.rdchem.Mol + The molecule to transform inplace + + """ + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + for reaction in reactions: + while reaction.RunReactantInPlace(reactant): + reactant.UpdatePropertyCache(strict=False) + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + + def _rebuild_conjugated_bonds( + self, mol: "Chem.Mol", max_iter: Optional[int] = None + ) -> None: + """Rebuild conjugated bonds without negatively charged atoms at the + beginning and end of the conjugated system + + Depending on the order in which atoms are read during the conversion, + the :meth:`~MDAnalysisInferer._infer_bo_and_charges` function might + write conjugated systems with a double bond less and both edges of the + system as anions instead of the usual alternating single and double + bonds. This function corrects this behaviour by using an iterative + procedure. + The problematic molecules always follow the same pattern: + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the + number of successive single and double bonds. The goal of the + iterative procedure is to make ``n`` as small as possible by + consecutively transforming ``anion-*=*`` into ``*=*-anion`` until it + reaches the smallest pattern with ``n=1``. This last pattern is then + transformed ``anion-*=*-anion`` to ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, + we can control that we don't transform the same triplet of atoms back + and forth by adding their indices to a list. + This function also handles conjugated systems with triple bonds. + The molecule needs to be kekulized first to also cover systems + with aromatic rings. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform, modified inplace + max_iter : Optional[int] + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. + Maximum number of iterations to standardize conjugated systems + + Notes + ----- + The molecule is modified inplace + """ + max_iter = self.max_iter if max_iter is None else max_iter + mol.UpdatePropertyCache(strict=False) + Chem.Kekulize(mol) + # pattern used to find problematic conjugated bonds + # there's usually an even number of matches for this + pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") + # pattern used to finish fixing a series of conjugated bonds + base_end_pattern = Chem.MolFromSmarts( + "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]" + ) + # used when there's an odd number of matches for `pattern` + odd_end_pattern = Chem.MolFromSmarts( + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," + "$([S;D4;v4]-[O-])]" + ) + # number of unique matches with the pattern + n_matches = len( + set([match[0] for match in mol.GetSubstructMatches(pattern)]) + ) + # nothing to standardize + if n_matches == 0: + return + # single match (unusual) + elif n_matches == 1: + # as a last resort, the only way to standardize is to find a + # nitrogen that can accept a double bond and a positive charge + # or a carbonyl that will become an enolate + end_pattern = odd_end_pattern + # at least 2 matches + else: + # give priority to base end pattern and then deal with the odd one + # if necessary + end_pattern = base_end_pattern + backtrack = [] + backtrack_cycles = 0 + for _ in range(max_iter): + # check for ending pattern + end_match = mol.GetSubstructMatch(end_pattern) + if end_match: + # index of each atom + anion1, a1, a2, anion2 = end_match + term_atom = mol.GetAtomWithIdx(anion2) + # edge-case 1: C-[O-] or [N+]-[O-] + # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] + # transform the =O to -[O-] + if ( + term_atom.GetAtomicNum() == 6 + and term_atom.GetFormalCharge() == 0 + ) or ( + term_atom.GetAtomicNum() == 7 + and term_atom.GetFormalCharge() == 1 + ): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms( + anion2, neighbor.GetIdx() + ) + if ( + neighbor.GetAtomicNum() == 8 + and bond.GetBondTypeAsDouble() == 2 + ): + bond.SetBondType(Chem.BondType.SINGLE) + neighbor.SetFormalCharge(-1) + break + # edge-case 2: S=O + # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O + # transform -[O-] to =O + elif ( + term_atom.GetAtomicNum() == 16 + and term_atom.GetFormalCharge() == 0 + ): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms( + anion2, neighbor.GetIdx() + ) + if ( + neighbor.GetAtomicNum() == 8 + and neighbor.GetFormalCharge() == -1 + and bond.GetBondTypeAsDouble() == 1 + ): + bond.SetBondType(Chem.BondType.DOUBLE) + neighbor.SetFormalCharge(0) + break + # not an edge case: + # increment the charge + else: + term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) + # common to all cases: + # [*-]-*=*-[R] --> *=*-*=[R] + # increment the charge and switch single and double bonds + a = mol.GetAtomWithIdx(anion1) + a.SetFormalCharge(a.GetFormalCharge() + 1) + b = mol.GetBondBetweenAtoms(anion1, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + b = mol.GetBondBetweenAtoms(a2, anion2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + mol.UpdatePropertyCache(strict=False) + continue + + # switch the position of the charge and the double bond + matches = mol.GetSubstructMatches(pattern) + if matches: + # check if we haven't already transformed this triplet + for match in matches: + # store order-independent atom indices + g = set(match) + # already transformed --> try the next one + if g in backtrack: + continue + # add to backtracking and start the switch + else: + anion, a1, a2 = match + backtrack.append(g) + break + # already performed all changes + else: + if backtrack_cycles == 1: + # might be stuck because there were 2 "odd" end + # patterns misqualified as a single base one + end_pattern = odd_end_pattern + elif backtrack_cycles > 1: + # likely stuck changing the same bonds over and over so + # let's stop here + mol.UpdatePropertyCache(strict=False) + return + match = matches[0] + anion, a1, a2 = match + backtrack = [set(match)] + backtrack_cycles += 1 + + # switch charges + a = mol.GetAtomWithIdx(anion) + a.SetFormalCharge(a.GetFormalCharge() + 1) + a = mol.GetAtomWithIdx(a2) + a.SetFormalCharge(a.GetFormalCharge() - 1) + # switch bond orders + b = mol.GetBondBetweenAtoms(anion, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + mol.UpdatePropertyCache(strict=False) + # update number of matches for the end pattern + n_matches = len(set([match[0] for match in matches])) + if n_matches == 1: + end_pattern = odd_end_pattern + # start new iteration + continue + + # no more changes to apply + mol.UpdatePropertyCache(strict=False) + return + + # reached max_iter + warnings.warn( + "The standardization could not be completed within a " + "reasonable number of iterations" + ) + + +@dataclass(frozen=True) +class TemplateInferer: + """Infer bond orders and charges by matching the molecule with a template + molecule containing bond orders and charges. + + Attributes + ---------- + template : rdkit.Chem.rdchem.Mol + Molecule containing the bond orders and charges. + adjust_hydrogens: bool, default = True + If ``True``, temporarily removes hydrogens from the molecule to + assign bond orders and charges from the template, then adds them + back. Useful to avoid adding explicit hydrogens on the template which + can prevent RDKit from finding a match between the template and the + molecule. + + .. versionadded:: 2.8.0 + """ + + template: "Chem.Mol" + adjust_hydrogens: bool = True + + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + if self.adjust_hydrogens: + return self.assign_from_template_with_adjusted_hydrogens(mol) + return AssignBondOrdersFromTemplate(self.template, mol) + + def assign_from_template_with_adjusted_hydrogens( + self, mol: "Chem.Mol", index_field: str = "_MDAnalysis_index" + ) -> "Chem.Mol": + """Temporarily removes hydrogens from the molecule before assigning + bond orders and charges from the template, then adds them back. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.Mol + Molecule to assign bond orders and charges to. + index_field : str, default = "_MDAnalysis_index" + Atom property corresponding to a unique integer that can used to + put back the hydrogen atoms that were removed before matching, + and sort them back to the original order. Must be accessible + through ``atom.GetIntProp``. + """ + # remove explicit hydrogens and assign BO from template + mol_no_h = Chem.RemoveAllHs(mol, sanitize=False) + new = AssignBondOrdersFromTemplate(self.template, mol_no_h) + # mapping between AtomGroup index of atom bearing H --> H atom(s) + atoms_with_hydrogens = defaultdict(list) + for atom in mol.GetAtoms(): + if atom.GetAtomicNum() == 1: + atoms_with_hydrogens[ + atom.GetNeighbors()[0].GetIntProp(index_field) + ].append(atom) + # mapping between atom that should be bearing a H in RWMol and + # corresponding H(s) + reverse_mapping = {} + for atom in new.GetAtoms(): + if (idx := atom.GetIntProp(index_field)) in atoms_with_hydrogens: + reverse_mapping[atom.GetIdx()] = atoms_with_hydrogens[idx] + # Set ExplicitH to 0 to avoid valence errors when adding Hs + atom.SetNumExplicitHs(0) + # let UpdatePropertyCache recalculate the number of radicals later + atom.SetNumRadicalElectrons(0) + # add missing Hs + rwmol = Chem.RWMol(new) + for atom_idx, hydrogens in reverse_mapping.items(): + for hydrogen in hydrogens: + h_idx = rwmol.AddAtom(hydrogen) + rwmol.AddBond(atom_idx, h_idx, Chem.BondType.SINGLE) + new = rwmol.GetMol() + # sanitize + new.UpdatePropertyCache() + sanitize_mol(new) + # reorder atoms as input atomgroup (through _MDAnalysis_index) + return reorder_atoms(new, field=index_field) + + +@dataclass(frozen=True) +class RDKitInferer: + """Uses RDKit's :func:`~rdkit.Chem.rdDetermineBonds.DetermineBondOrders` + to infer bond orders and formal charges. This is the same algorithm used + by the :ref:`xyz2mol ` package. + + .. versionadded:: 2.8.0 + """ + + charge: int = 0 + + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + new = Chem.Mol(mol) + DetermineBondOrders(new, charge=self.charge) + return new diff --git a/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst b/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst index 760446fd1d5..fa4525f69b6 100644 --- a/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst +++ b/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst @@ -1,3 +1,5 @@ .. automodule:: MDAnalysis.converters.RDKitParser .. automodule:: MDAnalysis.converters.RDKit + +.. automodule:: MDAnalysis.converters.RDKitInferring diff --git a/package/pyproject.toml b/package/pyproject.toml index d1e8e741c5a..84cc1db4faf 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -79,7 +79,7 @@ extra_formats = [ "pyedr>=0.7.0", "pytng>=0.2.3", "gsd>3.0.0", - "rdkit>=2020.03.1", + "rdkit>=2022.09.1", ] analysis = [ "biopython>=1.80", @@ -119,4 +119,4 @@ find = {} [tool.setuptools.package-data] MDAnalysis = [ 'analysis/data/*.npy', -] +] \ No newline at end of file diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 85860162f7b..4a8d40733dd 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -23,28 +23,31 @@ import copy import warnings +from contextlib import suppress from io import StringIO +from unittest.mock import Mock import MDAnalysis as mda import numpy as np import pytest from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysisTests.datafiles import GRO, PDB_full, PDB_helix, mol2_molecule +from MDAnalysisTests.datafiles import (GRO, TRR, PDB_full, PDB_helix, + mol2_molecule) from MDAnalysisTests.util import import_not_available -from numpy.testing import assert_allclose, assert_equal +from numpy.testing import assert_allclose, assert_array_equal, assert_equal -try: +with suppress(ImportError): from MDAnalysis.converters.RDKit import (RDATTRIBUTES, _add_mda_attr_to_rdkit, - _atom_sorter, - _infer_bo_and_charges, - _rebuild_conjugated_bonds, _set_atom_property, - _standardize_patterns) + atomgroup_to_mol) + from MDAnalysis.converters.RDKitInferring import (MDAnalysisInferer, + RDKitInferer, + TemplateInferer, + reorder_atoms, + sanitize_mol) from rdkit import Chem from rdkit.Chem import AllChem -except ImportError: - pass requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), @@ -71,7 +74,7 @@ def mol2_mol(): def smiles_mol(): mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) - cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) + AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol @staticmethod @@ -148,14 +151,14 @@ def mol2(self): # add elements elements = np.array([guess_atom_element(x) for x in u.atoms.types], dtype=object) - u.add_TopologyAttr('elements', elements) + u.add_TopologyAttr("elements", elements) return u @pytest.fixture def peptide(self): u = mda.Universe(GRO) elements = mda.topology.guessers.guess_types(u.atoms.names) - u.add_TopologyAttr('elements', elements) + u.add_TopologyAttr("elements", elements) return u.select_atoms("resid 2-12") @pytest.fixture @@ -171,7 +174,7 @@ def test_no_atoms_attr(self): def test_single_atom_mol(self, smi): u = mda.Universe.from_smiles(smi, addHs=False, generate_coordinates=False) - mol = u.atoms.convert_to.rdkit(NoImplicit=False) + mol = u.atoms.convert_to.rdkit(inferer=None) assert mol.GetNumAtoms() == 1 assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") @@ -194,7 +197,7 @@ def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) mda_atom = sel.atoms[atom_index] - umol = sel.convert_to.rdkit(NoImplicit=False) + umol = sel.convert_to.rdkit(inferer=None) atom = umol.GetAtomWithIdx(atom_index) mi = atom.GetMonomerInfo() assert mda_atom.altLoc == mi.GetAltLoc() @@ -227,7 +230,7 @@ def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) # Delete topology attribute (PR #3069) - u.del_TopologyAttr('elements') + u.del_TopologyAttr("elements") with pytest.raises( AttributeError, @@ -244,7 +247,7 @@ def test_warn_guess_bonds(self): def test_bonds_outside_sel(self): u = mda.Universe(Chem.MolFromSmiles("CC(=O)C")) ag = u.select_atoms("index 1") - ag.convert_to.rdkit(NoImplicit=False) + ag.convert_to.rdkit(inferer=None) def test_error_no_hydrogen(self, uo2): with pytest.raises(AttributeError, @@ -255,12 +258,12 @@ def test_error_no_hydrogen(self, uo2): def test_error_no_hydrogen_implicit(self, uo2): with warnings.catch_warnings(): warnings.simplefilter("error") - uo2.atoms.convert_to.rdkit(NoImplicit=False) + uo2.atoms.convert_to.rdkit(inferer=None) def test_warning_no_hydrogen_force(self, uo2): with pytest.warns(UserWarning, match="Forcing to continue the conversion"): - uo2.atoms.convert_to.rdkit(NoImplicit=False, force=True) + uo2.atoms.convert_to.rdkit(force=True) @pytest.mark.parametrize("attr, value, expected", [ ("names", "N", " N "), @@ -301,13 +304,13 @@ def test_other_attributes(self, mol2, idx): ]) def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) - mol = ag.convert_to.rdkit(NoImplicit=False) + mol = ag.convert_to.rdkit(inferer=None) expected = [i for i in range(len(ag))] indices = [a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()] assert_equal(indices, expected) def test_assign_coordinates(self, pdb): - mol = pdb.atoms.convert_to.rdkit(NoImplicit=False) + mol = pdb.atoms.convert_to.rdkit(inferer=None) positions = mol.GetConformer().GetPositions() assert_allclose(positions, pdb.atoms.positions) @@ -379,7 +382,7 @@ def test_cache(self): assert cached_func.cache_info().maxsize == 3 # test (4): caching is sensitive to converter arguments previous_cache = cached_func.cache_info() - uc.atoms.convert_to.rdkit(NoImplicit=False) + uc.atoms.convert_to.rdkit(implicit_hydrogens=True) cache = cached_func.cache_info() assert cache.misses == previous_cache.misses + 1 # test (5): skip cache @@ -388,20 +391,67 @@ def test_cache(self): assert cache == new_cache def test_sanitize_fail_warning(self): - mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", sanitize=False) - mol.UpdatePropertyCache(strict=False) - mol = Chem.AddHs(mol) + params = Chem.SmilesParserParams() + params.removeHs = False + params.sanitize = False + mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", params) u = mda.Universe(mol) - with pytest.warns(UserWarning, match="Could not sanitize molecule"): - u.atoms.convert_to.rdkit(NoImplicit=False) - with warnings.catch_warnings(): - warnings.filterwarnings("error", "Could not sanitize molecule") - u.atoms.convert_to.rdkit() + with pytest.warns() as record: + u.atoms.convert_to.rdkit(inferer=None) + assert "Could not sanitize molecule" not in "\n".join( + [str(r.message)for r in record]) + + def test_deprecation_max_iter(self, mol2, monkeypatch): + mock = Mock(wraps=atomgroup_to_mol) + monkeypatch.setattr("MDAnalysis.converters.RDKit.atomgroup_to_mol", + mock) + with pytest.warns(DeprecationWarning, + match="Using `max_iter` is deprecated"): + mol2.atoms.convert_to.rdkit(max_iter=2) + assert mock.call_args.kwargs["inferer"].max_iter == 2 + + def test_deprecation_NoImplicit(self, mol2): + with pytest.warns(DeprecationWarning, + match="Using `NoImplicit` is deprecated"): + mol2.atoms.convert_to.rdkit(NoImplicit=True) + + def test_deprecation_atomgroup_to_mol_NoImplicit(self, mol2): + with pytest.warns(DeprecationWarning, + match="Using `NoImplicit` is deprecated"): + atomgroup_to_mol(mol2.atoms, NoImplicit=False) + + def test_atomgroup_to_mol_unexpected_kwargs(self, mol2): + with pytest.raises(ValueError, + match="Found unexpected arguments: {'foo': 'bar'}"): + atomgroup_to_mol(mol2.atoms, foo="bar") + + def test_custom_callable_inferer(self, mol2): + def potatoe(mol): + new = Chem.Mol(mol) + new.SetProp("_Name", "🥔") + return new + + mol = mol2.atoms.convert_to.rdkit(inferer=potatoe) + assert mol.GetProp("_Name") == "🥔" @requires_rdkit -class TestRDKitFunctions(object): - +class TestRDKitInferringFunctions: + def test_sanitize_mol_warning(self): + mol = Chem.MolFromSmiles("[NH4]", sanitize=False) + with pytest.warns(match="Could not sanitize molecule: failed during " + "step rdkit.Chem.rdmolops.SanitizeFlags" + ".SANITIZE_PROPERTIES"): + sanitize_mol(mol) + + def test_reorder_atoms_key_error(self): + mol = Chem.MolFromSmiles("CCO") + with pytest.warns(match="'_MDAnalysis_index' not available on the" + " input mol atoms, skipping reordering of atoms"): + reorder_atoms(mol) + + +class BaseInferer: def add_Hs_remove_bo_and_charges(self, mol): """Add hydrogens and remove bond orders and charges from a molecule""" mH = Chem.AddHs(mol) @@ -415,6 +465,19 @@ def add_Hs_remove_bo_and_charges(self, mol): mH.UpdatePropertyCache(strict=False) return mH + def assert_isomorphic_resonance_structure(self, mol, ref): + """ + Checks if 2 molecules are isomorphic using their resonance structures + """ + isomorphic = mol.HasSubstructMatch(ref) + if not isomorphic: + isomorphic = bool(Chem.ResonanceMolSupplier(mol) + .GetSubstructMatch(ref)) + assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" + + +@requires_rdkit +class TestRDKitMDAnalysisInferer(BaseInferer): def enumerate_reordered_mol(self, mol): """Enumerates all possible starting atoms for a given molecule""" # go through each possible starting atom @@ -426,22 +489,9 @@ def enumerate_reordered_mol(self, mol): reordered_mol.UpdatePropertyCache(strict=False) yield reordered_mol - def assign_bond_orders_and_charges(self, mol): - """Returns a sanitized molecule with infered bond orders and charges""" - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol) - Chem.SanitizeMol(mol) - return mol - - def assert_isomorphic_resonance_structure(self, mol, ref): - """Checks if 2 molecules are isomorphic using their resonance - structures - """ - isomorphic = mol.HasSubstructMatch(ref) - if not isomorphic: - isomorphic = bool(Chem.ResonanceMolSupplier(mol) - .GetSubstructMatch(ref)) - assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" + @pytest.fixture(scope="class") + def inferer(self): + return MDAnalysisInferer() @pytest.mark.parametrize("smi, out", [ ("C(-[H])(-[H])(-[H])-[H]", "C"), @@ -462,11 +512,12 @@ def assert_isomorphic_resonance_structure(self, mol, ref): def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) + inferer = MDAnalysisInferer() + inferer._infer_bo_and_charges(mol) Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), f"{Chem.MolToSmiles(mol)} != {out}" @pytest.mark.parametrize("smi, atom_idx, charge", [ ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), @@ -477,7 +528,8 @@ def test_infer_bond_orders(self, smi, out): def test_infer_charges(self, smi, atom_idx, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) + inferer = MDAnalysisInferer() + inferer._infer_bo_and_charges(mol) Chem.SanitizeMol(mol) assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge @@ -500,13 +552,13 @@ def test_infer_charges(self, smi, atom_idx, charge): ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), ]) - def test_standardize_patterns(self, smi, out): + def test_standardize_patterns(self, inferer, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - mol = self.assign_bond_orders_and_charges(mol) + mol = inferer(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), f"{Chem.MolToSmiles(mol)} != {out}" @pytest.mark.parametrize("attr, value, getter", [ ("index", 42, "GetIntProp"), @@ -539,7 +591,7 @@ def test_ignore_prop(self): "c1cc[nH]c1", "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", ]) - def test_transfer_properties(self, smi): + def test_transfer_properties(self, inferer, smi): mol = Chem.MolFromSmiles(smi) mol = self.add_Hs_remove_bo_and_charges(mol) old = {} @@ -548,7 +600,7 @@ def test_transfer_properties(self, smi): atom.SetIntProp("_MDAnalysis_index", ix) atom.SetProp("dummy", f"foo_{ix}") old[ix] = {"_MDAnalysis_index": ix, "dummy": f"foo_{ix}"} - newmol = self.assign_bond_orders_and_charges(mol) + newmol = inferer(mol) new = {} for a in newmol.GetAtoms(): ix = a.GetIntProp("_MDAnalysis_index") @@ -632,13 +684,13 @@ def test_transfer_properties(self, smi): "[O-]/N=C1/c2ccccc2-c2nc3ccc(C(F)(F)F)cc3nc21 CHEMBL4557878", "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", ]) - def test_order_independant(self, smi): + def test_order_independant(self, inferer, smi): # generate mol with hydrogens but without bond orders ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) # enumerate mols with different starting atoms for m in self.enumerate_reordered_mol(stripped_mol): - m = self.assign_bond_orders_and_charges(m) + m = inferer(m) m = Chem.RemoveHs(m) self.assert_isomorphic_resonance_structure(m, ref) @@ -655,9 +707,19 @@ def test_order_independant_issue_3339(self, smi): def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) + inferer = MDAnalysisInferer(max_iter=2) with pytest.warns(UserWarning, match="reasonable number of iterations"): - _rebuild_conjugated_bonds(mol, 2) + inferer._rebuild_conjugated_bonds(mol) + + def test_deprecation_warning_max_iter(self): + smi = "c1ccccc1" + mol = Chem.MolFromSmiles(smi) + inferer = MDAnalysisInferer() + with pytest.warns(DeprecationWarning, + match="Specifying `max_iter` is deprecated and will" + " be removed"): + inferer._standardize_patterns(mol, max_iter=1) @pytest.mark.parametrize("smi", [ "[Li+]", "[Na+]", "[K+]", "[Rb+]", "[Ag+]", "[Cs+]", @@ -667,10 +729,10 @@ def test_warn_conjugated_max_iter(self): "[O-2]", "[Na+].[Cl-]", ]) - def test_ions(self, smi): + def test_ions(self, inferer, smi): ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) - mol = self.assign_bond_orders_and_charges(stripped_mol) + mol = inferer(stripped_mol) assert is_isomorphic(mol, ref) @pytest.mark.parametrize("smi", [ @@ -717,6 +779,128 @@ def test_atom_sorter(self): # atom indices: 1 3 5 6 mol.UpdatePropertyCache() sorted_atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], key=_atom_sorter) + if atom.GetAtomicNum() > 1], + key=MDAnalysisInferer._atom_sorter) sorted_indices = [atom.GetIdx() for atom in sorted_atoms] assert sorted_indices == [6, 5, 1, 3] + + +@requires_rdkit +class TestRDKitTemplateInferer: + @pytest.fixture(scope="function") + def template(self): + return Chem.MolFromSmiles("[NH3+]-C=O") + + @pytest.fixture(scope="function") + def template_with_hydrogens(self): + params = Chem.SmilesParserParams() + params.removeHs = False + return Chem.MolFromSmiles("[N+](-[H])(-[H])(-[H])-C(-[H])=O", params) + + @pytest.fixture(scope="function") + def mol(self, template): + mol = Chem.AddHs(template) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + mol.GetAtomWithIdx(0).SetFormalCharge(0) + mol.UpdatePropertyCache(strict=False) + for atom in mol.GetAtoms(): + atom.SetIntProp("_MDAnalysis_index", atom.GetIdx()) + return mol + + def test_template_inferring(self, mol, template_with_hydrogens): + inferer = TemplateInferer( + template=template_with_hydrogens, adjust_hydrogens=False + ) + new = inferer(mol) + assert new.GetAtomWithIdx(0).GetFormalCharge() == 1 + + def test_template_inferring_adjust_hydrogens(self, mol, template): + inferer = TemplateInferer(template=template, adjust_hydrogens=True) + new = inferer(mol) + assert new.GetAtomWithIdx(0).GetFormalCharge() == 1 + + def test_from_convert_to_method(self, template_with_hydrogens): + mol = Chem.Mol(template_with_hydrogens) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + u = mda.Universe(mol) + mol = u.atoms.convert_to.rdkit( + inferer=TemplateInferer( + template=template_with_hydrogens, adjust_hydrogens=False + ) + ) + assert mol.GetAtomWithIdx(0).GetFormalCharge() == 1 + xyz = mol.GetConformer().GetPositions() + assert_array_equal(xyz, u.atoms.positions) + + def test_adjust_hydrogens_bigger_mol(self): + u = mda.Universe(GRO, TRR) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr("elements", elements) + ag = u.select_atoms("resid 1-3") + inferer = TemplateInferer( + template=Chem.MolFromSmiles( + "CCC(C)C(C=O)NC(=O)C(CCCNC(N)=[NH2+])NC(=O)C([NH3+])CCSC" + ), + adjust_hydrogens=True, + ) + mol = ag.convert_to.rdkit(inferer=inferer) + xyz = mol.GetConformer().GetPositions() + assert_array_equal(xyz, ag.positions) + + +@requires_rdkit +class TestRDKitInferer(BaseInferer): + @pytest.mark.parametrize( + "smi", + [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", + # amino acids + "C[C@H](N)C(=O)O", # A + "NCC(=O)O", # G + "CC[C@H](C)[C@H](N)C(=O)O", # I + "CC(C)C[C@H](N)C(=O)O", # L + "O=C(O)[C@@H]1CCCN1", # P + "CC(C)[C@H](N)C(=O)O", # V + "N[C@@H](Cc1ccccc1)C(=O)O", # F + "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W + "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y + "N[C@@H](CC(=O)O)C(=O)O", # D + "N[C@@H](CCC(=O)O)C(=O)O", # E + "N=C(N)NCCC[C@H](N)C(=O)O", # R + "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H + "NCCCC[C@H](N)C(=O)O", # K + "N[C@@H](CO)C(=O)O", # S + "C[C@@H](O)[C@H](N)C(=O)O", # T + "N[C@@H](CS)C(=O)O", # C + "CSCC[C@H](N)C(=O)O", # M + "NC(=O)C[C@H](N)C(=O)O", # N + "NC(=O)CC[C@H](N)C(=O)O", # Q + # nucleic acids + "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A + "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T + "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U + "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C + "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G + ], + ) + def test_inferring(self, smi): + # generate mol with hydrogens but without bond orders + ref = Chem.MolFromSmiles(smi) + charge = sum(atom.GetFormalCharge() for atom in ref.GetAtoms()) + inferer = RDKitInferer(charge=charge) + mol = Chem.AddHs(ref) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + stripped_mol = self.add_Hs_remove_bo_and_charges(mol) + new = inferer(stripped_mol) + new = Chem.RemoveHs(new) + self.assert_isomorphic_resonance_structure(new, ref)