From 9ebc1b23e83eaa270a3241fe2ede21afacbdb17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 21 Aug 2020 17:14:30 -0700 Subject: [PATCH] Towards #2468 (RDKit interoperability GSOC project) * Overview of work done in PR This PR adds the `RDKitConverter` class which converts MDAnalysis `Universe`/`Atomgroup` objects to `RDKit rdchem.Mol` objects. * Limitations - Bonds and elements must be present (the former will be inferred if not present). - This converter mainly aims at supporting cases where explicit hydrogens are present. * Extra implementation details - Bond order and formal charges are inferred via atomic valencies & the number of unpaired electrons (see `_infer_bo_and_charges`). - In part due to the influence of atom read order on inferring, bond conjugation and functional groups are standardized using SMARTS reactions (see `_standardize_patterns`). * Other changes Also includes some PEP8 changes and some cleaning up of the tests for the `RDKITReader`. * References For more information on the development process for this PR, see: 1) https://cbouy.github.io/blog/2020/07/01/rdkit-converter 2) https://cbouy.github.io/blog/2020/07/22/rdkit-converter-part2 --- package/CHANGELOG | 2 + package/MDAnalysis/coordinates/RDKit.py | 769 +++++++++++++++++- package/doc/sphinx/source/conf.py | 1 + .../source/documentation_pages/converters.rst | 1 + .../converters/RDKitParser.rst | 3 + .../topology/RDKitParser.rst | 1 - .../documentation_pages/topology_modules.rst | 1 - .../MDAnalysisTests/coordinates/test_rdkit.py | 498 +++++++++++- 8 files changed, 1229 insertions(+), 47 deletions(-) create mode 100644 package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst delete mode 100644 package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst diff --git a/package/CHANGELOG b/package/CHANGELOG index 1df27904a8e..9df19de4a28 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -79,6 +79,8 @@ Enhancements * Added new kwargs `select_remove` and `select_protein` to analysis.dihedrals.Janin analysis to give user more fine grained control over selections (PR #2899) + * Added an RDKit converter that works for any input with all hydrogens + explicit in the topology (Issue #2468, PR #2775) Changes * deprecated NumPy type aliases have been replaced with their actual types diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3c64241d109..54184371912 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -21,25 +21,30 @@ # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 # -"""RDKit molecule --- :mod:`MDAnalysis.coordinates.RDKit` +"""RDKit molecule I/O --- :mod:`MDAnalysis.coordinates.RDKit` ================================================================ -Read coordinates data from an `RDKit `_ :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` -into a MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with -:class:`RDKitConverter`. +Read coordinates data from an `RDKit `__ :class:`rdkit.Chem.rdchem.Mol` with +:class:`RDKitReader` into an MDAnalysis Universe. Convert it back to a +:class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. Example ------- ->>> from rdkit import Chem ->>> import MDAnalysis as mda ->>> mol = Chem.MolFromMol2File("docking_poses.mol2", removeHs=False) ->>> u = mda.Universe(mol) ->>> u - ->>> u.trajectory - +To read an RDKit molecule and then convert the AtomGroup back to an RDKit +molecule:: + + >>> from rdkit import Chem + >>> import MDAnalysis as mda + >>> mol = Chem.MolFromMol2File("docking_poses.mol2", removeHs=False) + >>> u = mda.Universe(mol) + >>> u + + >>> u.trajectory + + >>> u.atoms.convert_to("RDKIT") + Classes @@ -51,19 +56,59 @@ .. autoclass:: RDKitConverter :members: +.. autofunction:: _infer_bo_and_charges + +.. autofunction:: _standardize_patterns + +.. autofunction:: _rebuild_conjugated_bonds """ import warnings +import re +import copy import numpy as np +from ..exceptions import NoDataError +from ..topology.guessers import guess_atom_element +from ..core.topologyattrs import _TOPOLOGY_ATTRS from . import memory +from . import base + +try: + from rdkit import Chem + from rdkit.Chem import AllChem +except ImportError: + pass +else: + RDBONDORDER = { + 1: Chem.BondType.SINGLE, + 1.5: Chem.BondType.AROMATIC, + "ar": Chem.BondType.AROMATIC, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } + # add string version of the key for each bond + RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) + RDATTRIBUTES = { + "altLocs": "AltLoc", + "chainIDs": "ChainId", + "icodes": "InsertionCode", + "names": "Name", + "occupancies": "Occupancy", + "resnames": "ResidueName", + "resids": "ResidueNumber", + "segindices": "SegmentNumber", + "tempfactors": "TempFactor", + "bfactors": "TempFactor", + } + PERIODIC_TABLE = Chem.GetPeriodicTable() class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. - + .. versionadded:: 2.0.0 """ format = 'RDKIT' @@ -84,21 +129,709 @@ def _format_hint(thing): def __init__(self, filename, **kwargs): """Read coordinates from an RDKit molecule. - Each conformer in the original RDKit molecule will be read as a frame + Each conformer in the original RDKit molecule will be read as a frame in the resulting universe. Parameters ---------- - filename : rdkit.Chem.rdchem.Mol RDKit molecule """ n_atoms = filename.GetNumAtoms() coordinates = np.array([ - conf.GetPositions() for conf in filename.GetConformers()], + conf.GetPositions() for conf in filename.GetConformers()], dtype=np.float32) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") - coordinates = np.empty((1,n_atoms,3), dtype=np.float32) + coordinates = np.empty((1, n_atoms, 3), dtype=np.float32) coordinates[:] = np.nan - super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) \ No newline at end of file + super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) + + +class RDKitConverter(base.ConverterBase): + """Convert MDAnalysis :class:`~MDAnalysis.core.groups.AtomGroup` or + :class:`~MDAnalysis.core.universe.Universe` to RDKit + :class:`~rdkit.Chem.rdchem.Mol` + + MDanalysis attributes are stored in each RDKit + :class:`~rdkit.Chem.rdchem.Atom` of the resulting molecule in two different + ways: + + * in an :class:`~rdkit.Chem.rdchem.AtomPDBResidueInfo` object available + through the :meth:`~rdkit.Chem.rdchem.Atom.GetMonomerInfo` method if it's + an attribute that is typically found in a PDB file, + * directly as an atom property available through the + :meth:`~rdkit.Chem.rdchem.Atom.GetProp` methods for the others. + + Supported attributes: + + +-----------------------+-------------------------------------------+ + | MDAnalysis attribute | RDKit | + +=======================+===========================================+ + | altLocs | atom.GetMonomerInfo().GetAltLoc() | + +-----------------------+-------------------------------------------+ + | chainIDs | atom.GetMonomerInfo().GetChainId() | + +-----------------------+-------------------------------------------+ + | icodes | atom.GetMonomerInfo().GetInsertionCode() | + +-----------------------+-------------------------------------------+ + | names | atom.GetMonomerInfo().GetName() | + +-----------------------+-------------------------------------------+ + | occupancies | atom.GetMonomerInfo().GetOccupancy() | + +-----------------------+-------------------------------------------+ + | resnames | atom.GetMonomerInfo().GetResidueName() | + +-----------------------+-------------------------------------------+ + | resids | atom.GetMonomerInfo().GetResidueNumber() | + +-----------------------+-------------------------------------------+ + | segindices | atom.GetMonomerInfo().GetSegmentNumber() | + +-----------------------+-------------------------------------------+ + | tempfactors | atom.GetMonomerInfo().GetTempFactor() | + +-----------------------+-------------------------------------------+ + | bfactors | atom.GetMonomerInfo().GetTempFactor() | + +-----------------------+-------------------------------------------+ + | charges | atom.GetDoubleProp("_MDAnalysis_charge") | + +-----------------------+-------------------------------------------+ + | indices | atom.GetIntProp("_MDAnalysis_index") | + +-----------------------+-------------------------------------------+ + | segids | atom.GetProp("_MDAnalysis_segid") | + +-----------------------+-------------------------------------------+ + | types | atom.GetProp("_MDAnalysis_type") | + +-----------------------+-------------------------------------------+ + + Example + ------- + + To access MDAnalysis properties:: + + >>> import MDAnalysis as mda + >>> from MDAnalysis.tests.datafiles import PDB_full + >>> u = mda.Universe(PDB_full) + >>> mol = u.select_atoms('resname DMS').convert_to('RDKIT') + >>> mol.GetAtomWithIdx(0).GetMonomerInfo().GetResidueName() + 'DMS' + + To create a molecule for each frame of a trajectory:: + + from MDAnalysisTests.datafiles import PSF, DCD + from rdkit.Chem.Descriptors3D import Asphericity + + u = mda.Universe(PSF, DCD) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr('elements', elements) + ag = u.select_atoms("resid 1-10") + + for ts in u.trajectory: + mol = ag.convert_to("RDKIT") + x = Asphericity(mol) + + + Notes + ----- + + The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` + attribute to be present in the topology, else it will fail. + + It also requires the `bonds` attribute, although they will be automatically + guessed if not present. + + If both ``tempfactors`` and ``bfactors`` attributes are present, the + conversion will fail, since only one of these should be present. + Refer to Issue #1901 for a solution + + Hydrogens should be explicit in the topology file. If this is not the case, + use the parameter ``NoImplicit=False`` when using the converter to allow + implicit hydrogens and disable inferring bond orders and charges. + + Since one of the main use case of the converter is converting trajectories + and not just a topology, creating a new molecule from scratch for every + frame would be too slow so the converter uses a caching system. The cache + only remembers the id of the last AtomGroup that was converted, as well + as the arguments that were passed to the converter. This means that using + ``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the + cache since the selection is deleted from memory as soon as the conversion + is finished. Instead, users should do this in two steps by first saving the + selection in a variable and then converting the saved AtomGroup. It also + means that ``ag.convert_to("RDKIT")`` followed by + ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache. + Finally if you're modifying the AtomGroup in place between two conversions, + the id of the AtomGroup won't change and thus the converter will use the + cached molecule. For this reason, you can pass a ``cache=False`` argument + to the converter to bypass the caching system. + Note that the cached molecule doesn't contain the coordinates of the atoms. + + + .. versionadded:: 2.0.0 + + """ + + lib = 'RDKIT' + units = {'time': None, 'length': 'Angstrom'} + _cache = dict() + + def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): + """Write selection at current trajectory frame to + :class:`~rdkit.Chem.rdchem.Mol`. + + Parameters + ----------- + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` + + cache : bool + Use a cached copy of the molecule's topology when available. To be + used, the cached molecule and the new one have to be made from the + same AtomGroup object (same id) and with the same arguments passed + to the converter (with the exception of this `cache` argument) + NoImplicit : bool + Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` + """ + # parameters passed to atomgroup_to_mol and used by the cache + kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) + + try: + from rdkit import Chem + except ImportError: + raise ImportError("RDKit is required for the RDKitConverter but " + "it's not installed. Try installing it with \n" + "conda install -c conda-forge rdkit") + try: + # make sure to use atoms (Issue 46) + ag = obj.atoms + except AttributeError: + raise TypeError("No `atoms` attribute in object of type {}, " + "please use a valid AtomGroup or Universe".format( + type(obj))) from None + + if cache: + # key used to search the cache + key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" + for key, value in kwargs.items()) + try: + mol = self._cache[key] + except KeyError: + # only keep the current molecule in cache + self._cache.clear() + # create the topology + self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) + # continue on copy of the cached molecule + mol = copy.deepcopy(mol) + else: + self._cache.clear() + mol = self.atomgroup_to_mol(ag, **kwargs) + + # add a conformer for the current Timestep + if hasattr(ag, "positions"): + if np.isnan(ag.positions).any(): + warnings.warn("NaN detected in coordinates, the output " + "molecule will not have 3D coordinates assigned") + else: + # assign coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for atom in mol.GetAtoms(): + idx = atom.GetIntProp("_MDAnalysis_index") + xyz = ag.positions[idx].astype(float) + conf.SetAtomPosition(atom.GetIdx(), xyz) + mol.AddConformer(conf) + # assign R/S to atoms and Z/E to bonds + Chem.AssignStereochemistryFrom3D(mol) + Chem.SetDoubleBondNeighborDirections(mol) + + return mol + + def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200): + """Converts an AtomGroup to an RDKit molecule. + + Parameters + ----------- + ag : MDAnalysis.core.groups.AtomGroup + The AtomGroup to convert + NoImplicit : bool + Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` + """ + try: + elements = ag.elements + except NoDataError: + raise AttributeError( + "The `elements` attribute is required for the RDKitConverter " + "but is not present in this AtomGroup. Please refer to the " + "documentation to guess elements from other attributes or " + "type `help(mda.topology.guessers)`") from None + + if "H" not in ag.elements: + warnings.warn( + "No hydrogen atom could be found in the topology, but the " + "converter requires all hydrogens to be explicit. Please " + "check carefully the output molecule as the converter is " + "likely to add negative charges and assign incorrect bond " + "orders to structures with implicit hydrogens. Alternatively, " + "you can use the parameter `NoImplicit=False` when using the " + "converter to allow implicit hydrogens and disable inferring " + "bond orders and charges." + ) + + # attributes accepted in PDBResidueInfo object + pdb_attrs = {} + if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): + raise AttributeError( + "Both `tempfactors` and `bfactors` attributes are present but " + "only one can be assigned to the RDKit molecule. Please " + "delete the unnecessary one and retry." + ) + for attr in RDATTRIBUTES.keys(): + if hasattr(ag, attr): + pdb_attrs[attr] = getattr(ag, attr) + + other_attrs = {} + for attr in ["charges", "segids", "types"]: + if hasattr(ag, attr): + other_attrs[attr] = getattr(ag, attr) + + mol = Chem.RWMol() + # map index in universe to index in mol + atom_mapper = {} + + for i, (atom, element) in enumerate(zip(ag, elements)): + # create atom + rdatom = Chem.Atom(element.capitalize()) + # enable/disable adding implicit H to the molecule + rdatom.SetNoImplicit(NoImplicit) + # add PDB-like properties + mi = Chem.AtomPDBResidueInfo() + for attr, values in pdb_attrs.items(): + _add_mda_attr_to_rdkit(attr, values[i], mi) + rdatom.SetMonomerInfo(mi) + # other properties + for attr in other_attrs.keys(): + value = other_attrs[attr][i] + attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular + _set_atom_property(rdatom, attr, value) + _set_atom_property(rdatom, "_MDAnalysis_index", i) + # add atom + index = mol.AddAtom(rdatom) + atom_mapper[atom.ix] = index + + try: + ag.bonds + except NoDataError: + warnings.warn( + "No `bonds` attribute in this AtomGroup. Guessing bonds based " + "on atoms coordinates") + ag.guess_bonds() + + for bond in ag.bonds: + try: + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + continue + bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) + mol.AddBond(*bond_indices, bond_type) + + mol.UpdatePropertyCache(strict=False) + + if NoImplicit: + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol, max_iter) + + # sanitize + Chem.SanitizeMol(mol) + + return mol + + +def _add_mda_attr_to_rdkit(attr, value, mi): + """Converts an MDAnalysis atom attribute into the RDKit equivalent and + stores it into an RDKit :class:`~rdkit.Chem.rdchem.AtomPDBResidueInfo`. + + Parameters + ---------- + attr : str + Name of the atom attribute in MDAnalysis in the singular form + value : object, np.int or np.float + Attribute value as found in the AtomGroup + mi : rdkit.Chem.rdchem.AtomPDBResidueInfo + MonomerInfo object that will store the relevant atom attributes + """ + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "names": + # RDKit needs the name to be properly formatted for a + # PDB file (1 letter elements start at col 14) + name = re.findall(r'(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + if len(number) > 2 and len(symbol) == 1: + value = "{}{}".format(symbol, number) + else: + value = "{:>2.2}{:<2.2}".format(symbol, number) + else: + # no number in the name + value = " {:<}".format(name[0]) + + # set attribute value in RDKit MonomerInfo + rdattr = RDATTRIBUTES[attr] + getattr(mi, "Set%s" % rdattr)(value) + + +def _set_atom_property(atom, attr, value): + """Saves any attribute and value into an RDKit atom property""" + if isinstance(value, (float, np.float)): + atom.SetDoubleProp(attr, float(value)) + elif isinstance(value, (int, np.int)): + atom.SetIntProp(attr, int(value)) + else: + atom.SetProp(attr, value) + + +def _infer_bo_and_charges(mol): + """Infer bond orders and formal charges from a molecule. + + Since most MD topology files don't explicitly retain information on bond + orders or charges, it has to be guessed from the topology. This is done by + looping over each atom and comparing its expected valence to the current + valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge (-NUE). + If two neighbouring atoms have UEs, the bond between them most + likely has to be increased by the value of the smallest NUE. + If after this process, an atom still has UEs, it needs a negative formal + charge of -NUE. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + Notes + ----- + This algorithm is order dependant. For example, for a carboxylate group + R-C(-O)-O the first oxygen read will receive a double bond and the other + one will be charged. It will also affect more complex conjugated systems. + """ + + for atom in sorted(mol.GetAtoms(), reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]): + # get NUE for each possible valence + nue = _get_nb_unpaired_electrons(atom) + # if there's only one possible valence state and the corresponding + # NUE is negative, it means we can only add a positive charge to + # the atom + if (len(nue) == 1) and (nue[0] < 0): + atom.SetFormalCharge(-nue[0]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if above case or atom has no unpaired electron + if (len(nue) == 1) and (nue[0] <= 0): + continue + else: + neighbors = sorted(atom.GetNeighbors(), reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]) + # check if one of the neighbors has a common NUE + for i, na in enumerate(neighbors, start=1): + # get NUE for the neighbor + na_nue = _get_nb_unpaired_electrons(na) + # smallest common NUE + common_nue = min( + min([i for i in nue if i >= 0], default=0), + min([i for i in na_nue if i >= 0], default=0) + ) + # a common NUE of 0 means we don't need to do anything + if common_nue != 0: + # increase bond order + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + order = common_nue + 1 + bond.SetBondType(RDBONDORDER[order]) + mol.UpdatePropertyCache(strict=False) + if i < len(neighbors): + # recalculate nue for atom + nue = _get_nb_unpaired_electrons(atom) + + # if the atom still has unpaired electrons + nue = _get_nb_unpaired_electrons(atom)[0] + if nue > 0: + # transform it to a negative charge + atom.SetFormalCharge(-nue) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) + + +def _get_nb_unpaired_electrons(atom): + """Calculate the number of unpaired electrons (NUE) of an atom + + Parameters + ---------- + atom: rdkit.Chem.rdchem.Atom + The atom for which the NUE will be computed + + Returns + ------- + nue : list + The NUE for each possible valence of the atom + """ + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() - atom.GetFormalCharge() + return [v - current_v for v in expected_vs] + + +def _standardize_patterns(mol, max_iter=200): + """Standardizes functional groups + + Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, + and SMARTS reactions for other functional groups. + Due to the way reactions work, we first have to split the molecule by + fragments. Then, for each fragment, we apply the standardization reactions. + Finally, the fragments are recombined. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + max_iter : int + Maximum number of iterations to standardize conjugated systems + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule + + Notes + ----- + The following functional groups are transformed in this order: + + +---------------+-----------------------------------------------------------------------------+ + | Name | Reaction | + +===============+=============================================================================+ + | conjugated | ``[*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | conjugated-N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | conjugated-O- | ``[O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+-----------------------------------------------------------------------------+ + | Cterm | ``[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]`` | + +---------------+-----------------------------------------------------------------------------+ + | Nterm | ``[N-;X2;H1:1]>>[N+0:1]`` | + +---------------+-----------------------------------------------------------------------------+ + | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | + +---------------+-----------------------------------------------------------------------------+ + | arginine | ``[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | sulfone | ``[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3]`` | + +---------------+-----------------------------------------------------------------------------+ + | nitro | ``[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3]`` | + +---------------+-----------------------------------------------------------------------------+ + + """ + + # standardize conjugated systems + _rebuild_conjugated_bonds(mol, max_iter) + + fragments = [] + for reactant in Chem.GetMolFrags(mol, asMols=True): + + for name, reaction in [ + ("Cterm", "[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]"), + ("Nterm", "[N-;X2;H1:1]>>[N+0:1]"), + ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), + ("ARG", "[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]" + ">>[N:1]-[C+0:2](-[N:3])=[N+:4]"), + ("sulfone", "[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]" + ">>[S:1](=[O+0:2])=[O+0:3]"), + ("nitro", "[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]" + ">>[N+:1](-[O-:2])=[O+0:3]"), + ]: + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + reactant = _run_reaction(reaction, reactant) + + fragments.append(reactant) + + # recombine fragments + mol = fragments.pop(0) + for fragment in fragments: + mol = Chem.CombineMols(mol, fragment) + + return mol + + +def _run_reaction(reaction, reactant): + """Runs a reaction until all reactants are transformed + + If a pattern is matched N times in the molecule, the reaction will return N + products as an array of shape (N, 1). Only the first product will be kept + and the same reaction will be reapplied to the product N times in total. + + Parameters + ---------- + reaction : str + SMARTS reaction + reactant : rdkit.Chem.rdchem.Mol + The molecule to transform + + Returns + ------- + product : rdkit.Chem.rdchem.Mol + The final product of the reaction + """ + # count how many times the reaction should be run + pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) + n_matches = len(reactant.GetSubstructMatches(pattern)) + + # run the reaction for each matched pattern + rxn = AllChem.ReactionFromSmarts(reaction) + for n in range(n_matches): + products = rxn.RunReactants((reactant,)) + # only keep the first product + if products: + product = products[0][0] + # map back atom properties from the reactant to the product + _reassign_props_after_reaction(reactant, product) + # apply the next reaction to the product + product.UpdatePropertyCache(strict=False) + reactant = product + else: + # exit the n_matches loop if there's no product. Example + # where this is needed: SO^{4}_{2-} will match the sulfone + # pattern 6 times but the reaction is only needed once + break + return reactant + + +def _rebuild_conjugated_bonds(mol, max_iter=200): + """Rebuild conjugated bonds without negatively charged atoms at the + beginning and end of the conjugated system + + Depending on the order in which atoms are read during the conversion, the + :func:`_infer_bo_and_charges` function might write conjugated systems with + a double bond less and both edges of the system as anions instead of the + usual alternating single and double bonds. This function corrects this + behaviour by using an iterative procedure. + The problematic molecules always follow the same pattern: + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number + of successive single and double bonds. The goal of the iterative procedure + is to make ``n`` as small as possible by consecutively transforming + ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with + ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to + ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can + control that we don't transform the same triplet of atoms back and forth by + adding their indices to a list. + The molecule needs to be kekulized first to also cover systems + with aromatic rings. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform, modified inplace + max_iter : int + Maximum number of iterations performed by the function + """ + mol.UpdatePropertyCache(strict=False) + Chem.Kekulize(mol) + pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]") + # number of unique matches with the pattern + n_matches = len(set([match[0] + for match in mol.GetSubstructMatches(pattern)])) + if n_matches == 0: + # nothing to standardize + return + # check if there's an even number of anion-*=* patterns + elif n_matches % 2 == 0: + end_pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]-[*-]") + else: + # as a last resort, the only way to standardize is to find a nitrogen + # that can accept a double bond and a positive charge + # or a carbonyl that will become an enolate + end_pattern = Chem.MolFromSmarts( + "[*-;!O]-[*+0]=[*+0]-[$([#7;X3;v3]),$([#6+0]=O)]") + backtrack = [] + for _ in range(max_iter): + # simplest case where n=1 + end_match = mol.GetSubstructMatch(end_pattern) + if end_match: + # index of each atom + anion1, a1, a2, anion2 = end_match + term_atom = mol.GetAtomWithIdx(anion2) + # [*-]-*=*-C=O + if term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0: + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if neighbor.GetAtomicNum() == 8 and bond.GetBondTypeAsDouble() == 2: + bond.SetBondType(Chem.BondType.SINGLE) + neighbor.SetFormalCharge(-1) + else: + # [*-]-*=*-N + if term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 0: + end_charge = 1 + # [*-]-*=*-[*-] + else: + end_charge = 0 + mol.GetAtomWithIdx(anion2).SetFormalCharge(end_charge) + # common part of the conjugated systems: [*-]-*=* + mol.GetAtomWithIdx(anion1).SetFormalCharge(0) + mol.GetBondBetweenAtoms(anion1, a1).SetBondType( + Chem.BondType.DOUBLE) + mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) + mol.GetBondBetweenAtoms(a2, anion2).SetBondType( + Chem.BondType.DOUBLE) + mol.UpdatePropertyCache(strict=False) + + # shorten the anion-anion pattern from n to n-1 + matches = mol.GetSubstructMatches(pattern) + if matches: + # check if we haven't already transformed this triplet + for match in matches: + # sort the indices for the comparison + g = tuple(sorted(match)) + if g in backtrack: + # already transformed + continue + else: + # take the first one that hasn't been tried + anion, a1, a2 = match + backtrack.append(g) + break + else: + anion, a1, a2 = matches[0] + # charges + mol.GetAtomWithIdx(anion).SetFormalCharge(0) + mol.GetAtomWithIdx(a2).SetFormalCharge(-1) + # bonds + mol.GetBondBetweenAtoms(anion, a1).SetBondType( + Chem.BondType.DOUBLE) + mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) + mol.UpdatePropertyCache(strict=False) + # start new iteration + continue + + # no more changes to apply + return + + # reached max_iter + warnings.warn("The standardization could not be completed within a " + "reasonable number of iterations") + + +def _reassign_props_after_reaction(reactant, product): + """Maps back atomic properties from the reactant to the product. + The product molecule is modified inplace. + """ + for atom in product.GetAtoms(): + try: + atom.GetIntProp("old_mapno") + except KeyError: + pass + else: + atom.ClearProp("old_mapno") + idx = atom.GetUnsignedProp("react_atom_idx") + old_atom = reactant.GetAtomWithIdx(idx) + for prop, value in old_atom.GetPropsAsDict().items(): + _set_atom_property(atom, prop, value) + # fix bonds with "crossed" stereo + for bond in atom.GetBonds(): + if bond.GetStereo() == Chem.BondStereo.STEREOANY: + bond.SetStereo(Chem.BondStereo.STEREONONE) + atom.ClearProp("react_atom_idx") diff --git a/package/doc/sphinx/source/conf.py b/package/doc/sphinx/source/conf.py index 296b479d6f9..a149c769a8f 100644 --- a/package/doc/sphinx/source/conf.py +++ b/package/doc/sphinx/source/conf.py @@ -346,4 +346,5 @@ 'https://gsd.readthedocs.io/en/stable/': None, 'https://parmed.github.io/ParmEd/html/': None, 'https://docs.h5py.org/en/stable': None, + 'https://www.rdkit.org/docs/': None, } diff --git a/package/doc/sphinx/source/documentation_pages/converters.rst b/package/doc/sphinx/source/documentation_pages/converters.rst index 8bdcb913fc0..c70d2324184 100644 --- a/package/doc/sphinx/source/documentation_pages/converters.rst +++ b/package/doc/sphinx/source/documentation_pages/converters.rst @@ -33,4 +33,5 @@ you will have to specify a package name (case-insensitive). :: :maxdepth: 1 converters/ParmEdParser + converters/RDKitParser diff --git a/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst b/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst new file mode 100644 index 00000000000..174a1ff1115 --- /dev/null +++ b/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst @@ -0,0 +1,3 @@ +.. automodule:: MDAnalysis.topology.RDKitParser + +.. automodule:: MDAnalysis.coordinates.RDKit diff --git a/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst b/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst deleted file mode 100644 index 9f6bbd1dac9..00000000000 --- a/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst +++ /dev/null @@ -1 +0,0 @@ -.. automodule:: MDAnalysis.topology.RDKitParser \ No newline at end of file diff --git a/package/doc/sphinx/source/documentation_pages/topology_modules.rst b/package/doc/sphinx/source/documentation_pages/topology_modules.rst index 04ebf433e27..ed8caba8ce6 100644 --- a/package/doc/sphinx/source/documentation_pages/topology_modules.rst +++ b/package/doc/sphinx/source/documentation_pages/topology_modules.rst @@ -43,7 +43,6 @@ topology file format in the *topology_format* keyword argument to topology/PDBQTParser topology/PQRParser topology/PSFParser - topology/RDKitParser topology/TOPParser topology/TPRParser topology/TXYZParser diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7bdb845f95a..930e65fa730 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -20,59 +20,503 @@ # MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 # -import warnings +import copy import pytest import MDAnalysis as mda +from MDAnalysis.topology.guessers import guess_atom_element import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) -from MDAnalysisTests.datafiles import mol2_molecule +from MDAnalysisTests.datafiles import mol2_molecule, PDB_full, GRO, PDB_helix +from MDAnalysisTests.util import import_not_available -Chem = pytest.importorskip("rdkit.Chem") -AllChem = pytest.importorskip("rdkit.Chem.AllChem") -def mol2_mol(): - return Chem.MolFromMol2File(mol2_molecule, removeHs=False) +try: + from rdkit import Chem + from rdkit.Chem import AllChem + from MDAnalysis.coordinates.RDKit import ( + RDATTRIBUTES, + _add_mda_attr_to_rdkit, + _infer_bo_and_charges, + _standardize_patterns, + _rebuild_conjugated_bonds, + _set_atom_property, + _reassign_props_after_reaction, + ) +except ImportError: + pass -def smiles_mol(): - mol = Chem.MolFromSmiles("CCO") - mol = Chem.AddHs(mol) - cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) - return mol +requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), + reason="requires RDKit") + + +@pytest.mark.skipif(not import_not_available("rdkit"), + reason="only for min dependencies build") +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + u = mda.Universe(PDB_full) + with pytest.raises(ImportError, + match="RDKit is required for the RDKitConverter"): + u.atoms.convert_to("RDKIT") + + +@requires_rdkit +class MolFactory: + def mol2_mol(): + return Chem.MolFromMol2File(mol2_molecule, removeHs=False) + + def smiles_mol(): + mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") + mol = Chem.AddHs(mol) + cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) + return mol + + def dummy_product(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetIntProp("old_mapno", 0) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol + + def dummy_product_nomap(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol + + def dummy_reactant_noprops(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + mol.AddAtom(atom) + return mol + + def dummy_reactant(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetProp("foo", "bar") + atom.SetIntProp("_MDAnalysis_index", 1) + atom.SetDoubleProp("_MDAnalysis_charge", 4.2) + atom.SetProp("_MDAnalysis_type", "C.3") + mol.AddAtom(atom) + return mol + + +@pytest.fixture(scope="function") +def rdmol(request): + return getattr(MolFactory, request.param)() + + +@pytest.fixture(scope="function") +def product(request): + return getattr(MolFactory, request.param)() + + +@requires_rdkit class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ - (mol2_mol(), 1), - (smiles_mol(), 3), - ]) + ("mol2_mol", 1), + ("smiles_mol", 3), + ], indirect=["rdmol"]) def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames expected = np.array([ - conf.GetPositions() for conf in rdmol.GetConformers()], + conf.GetPositions() for conf in rdmol.GetConformers()], dtype=np.float32) assert_equal(expected, universe.trajectory.coordinate_array) def test_no_coordinates(self): - with warnings.catch_warnings(record=True) as w: - # Cause all warnings to always be triggered. - warnings.simplefilter("always") - # Trigger a warning. + with pytest.warns(UserWarning, match="No coordinates found"): u = mda.Universe.from_smiles("CCO", generate_coordinates=False) - # Verify the warning - assert len(w) == 1 - assert "No coordinates found" in str( - w[-1].message) - expected = np.empty((1,u.atoms.n_atoms,3), dtype=np.float32) + expected = np.empty((1, u.atoms.n_atoms, 3), dtype=np.float32) expected[:] = np.nan assert_equal(u.trajectory.coordinate_array, expected) def test_compare_mol2reader(self): - universe = mda.Universe(mol2_mol()) + universe = mda.Universe(MolFactory.mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal(universe.trajectory.ts.positions, + assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) - + + +@requires_rdkit +class TestRDKitConverter(object): + @pytest.fixture + def pdb(self): + return mda.Universe(PDB_full) + + @pytest.fixture + def mol2(self): + u = mda.Universe(mol2_molecule) + # add elements + elements = np.array([guess_atom_element(x) for x in u.atoms.types], + dtype=object) + u.add_TopologyAttr('elements', elements) + return u + + @pytest.fixture + def peptide(self): + u = mda.Universe(GRO) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr('elements', elements) + return u.select_atoms("resid 2-12") + + @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) + def test_single_atom_mol(self, smi): + u = mda.Universe.from_smiles(smi, addHs=False, + generate_coordinates=False) + mol = u.atoms.convert_to("RDKIT") + assert mol.GetNumAtoms() == 1 + assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") + + @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ + ("PRO", 14, 1), + ("ILE", 38, 1), + ("ALA", 20, 2), + ("GLY", 21, 3), + ]) + def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): + mol = peptide.select_atoms("resname %s" % resname).convert_to("RDKIT") + assert n_atoms == mol.GetNumAtoms() + assert n_fragments == len(Chem.GetMolFrags(mol)) + + @pytest.mark.parametrize("sel_str, atom_index", [ + ("resid 1", 0), + ("resname LYS and name NZ", 1), + ("resid 34 and altloc B", 2), + ]) + def test_monomer_info(self, pdb, sel_str, atom_index): + sel = pdb.select_atoms(sel_str) + umol = sel.convert_to("RDKIT") + atom = umol.GetAtomWithIdx(atom_index) + mda_index = atom.GetIntProp("_MDAnalysis_index") + mi = atom.GetMonomerInfo() + + for mda_attr, rd_attr in RDATTRIBUTES.items(): + rd_value = getattr(mi, "Get%s" % rd_attr)() + if hasattr(sel, mda_attr): + mda_value = getattr(sel, mda_attr)[mda_index] + if mda_attr == "names": + rd_value = rd_value.strip() + assert rd_value == mda_value + + @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], + indirect=True) + def test_identical_topology(self, rdmol): + u = mda.Universe(rdmol) + umol = u.atoms.convert_to("RDKIT") + assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) + u2 = mda.Universe(umol) + assert_equal(u.atoms.bonds, u2.atoms.bonds) + assert_equal(u.atoms.elements, u2.atoms.elements) + assert_equal(u.atoms.names, u2.atoms.names) + assert_almost_equal(u.atoms.positions, u2.atoms.positions, decimal=7) + + def test_raise_requires_elements(self): + u = mda.Universe(mol2_molecule) + with pytest.raises( + AttributeError, + match="`elements` attribute is required for the RDKitConverter" + ): + u.atoms.convert_to("RDKIT") + + def test_warn_guess_bonds(self): + u = mda.Universe(PDB_helix) + with pytest.warns(UserWarning, + match="No `bonds` attribute in this AtomGroup"): + u.atoms.convert_to("RDKIT") + + def test_warn_no_hydrogen(self): + u = mda.Universe.from_smiles("O=O") + with pytest.warns( + UserWarning, + match="No hydrogen atom could be found in the topology" + ): + u.atoms.convert_to("RDKIT") + + @pytest.mark.parametrize("attr, value, expected", [ + ("names", "C1", " C1 "), + ("names", "C12", " C12"), + ("names", "Cl1", "Cl1 "), + ("altLocs", "A", "A"), + ("chainIDs", "B", "B"), + ("icodes", "C", "C"), + ("occupancies", 0.5, 0.5), + ("resnames", "LIG", "LIG"), + ("resids", 123, 123), + ("segindices", 1, 1), + ("tempfactors", 0.8, 0.8), + ("bfactors", 0.8, 0.8), + ]) + def test_add_mda_attr_to_rdkit(self, attr, value, expected): + mi = Chem.AtomPDBResidueInfo() + _add_mda_attr_to_rdkit(attr, value, mi) + rdvalue = getattr(mi, "Get%s" % RDATTRIBUTES[attr])() + assert rdvalue == expected + + def test_bfactors_tempfactors_raises_error(self): + u = mda.Universe.from_smiles("C") + bfactors = np.array(u.atoms.n_atoms*[1.0], dtype=np.float32) + u.add_TopologyAttr('bfactors', bfactors) + u.add_TopologyAttr('tempfactors', bfactors) + with pytest.raises( + AttributeError, + match="Both `tempfactors` and `bfactors` attributes are present" + ): + u.atoms.convert_to("RDKIT") + + @pytest.mark.parametrize("idx", [0, 10, 42]) + def test_other_attributes(self, mol2, idx): + mol = mol2.atoms.convert_to("RDKIT") + rdatom = mol.GetAtomWithIdx(idx) + rdprops = rdatom.GetPropsAsDict() + mda_idx = int(rdprops["_MDAnalysis_index"]) + for prop in ["charge", "segid", "type"]: + rdprop = rdprops["_MDAnalysis_%s" % prop] + mdaprop = getattr(mol2.atoms[mda_idx], prop) + assert rdprop == mdaprop + + @pytest.mark.parametrize("sel_str", [ + "resname ALA", + "resname PRO and segid A", + ]) + def test_index_property(self, pdb, sel_str): + ag = pdb.select_atoms(sel_str) + mol = ag.convert_to("RDKIT") + expected = [i for i in range(len(ag))] + indices = sorted([a.GetIntProp("_MDAnalysis_index") + for a in mol.GetAtoms()]) + assert_equal(indices, expected) + + def test_assign_coordinates(self, pdb): + mol = pdb.atoms.convert_to("RDKIT") + positions = mol.GetConformer().GetPositions() + indices = sorted(mol.GetAtoms(), + key=lambda a: a.GetIntProp("_MDAnalysis_index")) + indices = [a.GetIdx() for a in indices] + assert_almost_equal(positions[indices], pdb.atoms.positions) + + def test_assign_stereochemistry(self, mol2): + umol = mol2.atoms.convert_to("RDKIT") + rdmol = Chem.MolFromMol2File(mol2_molecule, removeHs=False) + assert rdmol.HasSubstructMatch( + umol, useChirality=True) and umol.HasSubstructMatch( + rdmol, useChirality=True) + + def test_trajectory_coords(self): + u = mda.Universe.from_smiles( + "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42)) + for ts in u.trajectory: + mol = u.atoms.convert_to("RDKIT") + positions = mol.GetConformer().GetPositions() + indices = sorted(mol.GetAtoms(), + key=lambda a: a.GetIntProp("_MDAnalysis_index")) + indices = [a.GetIdx() for a in indices] + assert_almost_equal(positions[indices], ts.positions) + + def test_nan_coords(self): + u = mda.Universe.from_smiles("CCO") + xyz = u.atoms.positions + xyz[0][2] = np.nan + u.atoms.positions = xyz + with pytest.warns(UserWarning, match="NaN detected"): + mol = u.atoms.convert_to("RDKIT") + with pytest.raises(ValueError, match="Bad Conformer Id"): + mol.GetConformer() + + def test_cache(self): + u = mda.Universe.from_smiles("CCO", numConfs=5) + ag = u.atoms + cache = mda.coordinates.RDKit.RDKitConverter._cache + previous_cache = None + for ts in u.trajectory: + mol = ag.convert_to("RDKIT") + if previous_cache: + # the cache shouldn't change when iterating on timesteps + assert cache == previous_cache + previous_cache = copy.deepcopy(cache) + # cached molecule shouldn't store coordinates + mol = list(cache.values())[0] + with pytest.raises(ValueError, match="Bad Conformer Id"): + mol.GetConformer() + # only 1 molecule should be cached + u = mda.Universe.from_smiles("C") + mol = u.atoms.convert_to("RDKIT") + assert len(cache) == 1 + assert cache != previous_cache + # converter with kwargs + rdkit_converter = mda.coordinates.RDKit.RDKitConverter().convert + # cache should depend on passed arguments + previous_cache = copy.deepcopy(cache) + mol = rdkit_converter(u.atoms, NoImplicit=False) + assert cache != previous_cache + # skip cache + mol = rdkit_converter(u.atoms, cache=False) + assert cache == {} + + +@requires_rdkit +class TestRDKitFunctions(object): + @pytest.mark.parametrize("smi, out", [ + ("C(-[H])(-[H])(-[H])-[H]", "C"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", + "c1ccccc1"), + ("C-[C](-[H])-[O]", "C(=O)C"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[N]-[C]-[H]", "N#C"), + ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), + ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), + ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), + ("[H]-[O]-[N]-[O]", "ON=O"), + ("[N]-[C]-[O]", "N#C[O-]"), + ]) + def test_infer_bond_orders(self, smi, out): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(mol) + Chem.SanitizeMol(mol) + mol = Chem.RemoveHs(mol) + molref = Chem.MolFromSmiles(out) + assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( + mol), "{} != {}".format(Chem.MolToSmiles(mol), out) + + @pytest.mark.parametrize("smi, atom_idx, charge", [ + ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), + ("[N]-[C]-[O]", 2, -1), + ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), + ("C-[C](-[O])-[O]", 3, -1), + ]) + def test_infer_charges(self, smi, atom_idx, charge): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(mol) + Chem.SanitizeMol(mol) + assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge + + @pytest.mark.parametrize("smi, out", [ + ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), + ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), + ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), + ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", + "CNC(N)=[N+](-[H])-[H]"), + ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), + ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), + ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), + ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), + ("[C-](=O)-C", "[C](=O)-C"), + ("[H]-[N-]-C", "[H]-[N]-C"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", + "[O-]c1ccccc1"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", + "[O-]C1=CC=CC1=O"), + ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), + ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), + ]) + def test_standardize_patterns(self, smi, out): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol) + Chem.SanitizeMol(mol) + mol = Chem.RemoveHs(mol) + molref = Chem.MolFromSmiles(out) + assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( + mol), "{} != {}".format(Chem.MolToSmiles(mol), out) + + @pytest.mark.parametrize("attr, value, getter", [ + ("index", 42, "GetIntProp"), + ("index", np.int(42), "GetIntProp"), + ("charge", 4.2, "GetDoubleProp"), + ("charge", np.float(4.2), "GetDoubleProp"), + ("type", "C.3", "GetProp"), + ]) + def test_set_atom_property(self, attr, value, getter): + atom = Chem.Atom(1) + prop = "_MDAnalysis_%s" % attr + _set_atom_property(atom, prop, value) + assert getattr(atom, getter)(prop) == value + + @pytest.mark.parametrize("rdmol, product, name", [ + ("dummy_reactant", "dummy_product", "props"), + ("dummy_reactant_noprops", "dummy_product", "noprops"), + ("dummy_reactant", "dummy_product_nomap", "nomap"), + ], indirect=["rdmol", "product"]) + def test_reassign_props_after_reaction(self, rdmol, product, name): + _reassign_props_after_reaction(rdmol, product) + atom = product.GetAtomWithIdx(0) + if name == "props": + assert atom.GetProp("foo") == "bar" + assert atom.GetIntProp("_MDAnalysis_index") == 1 + assert atom.GetDoubleProp("_MDAnalysis_charge") == 4.2 + assert atom.GetProp("_MDAnalysis_type") == "C.3" + with pytest.raises(KeyError, match="old_mapno"): + atom.GetIntProp("old_mapno") + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + elif name == "noprops": + with pytest.raises(KeyError, match="old_mapno"): + atom.GetIntProp("old_mapno") + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + elif name == "nomap": + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + with pytest.raises(KeyError, match="_MDAnalysis_index"): + atom.GetIntProp("_MDAnalysis_index") + + @pytest.mark.parametrize("smi_in", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", + ]) + def test_order_independant(self, smi_in): + # generate mol with hydrogens but without bond orders + ref = Chem.MolFromSmiles(smi_in) + template = Chem.AddHs(ref) + for atom in template.GetAtoms(): + atom.SetIsAromatic(False) + atom.SetFormalCharge(0) + for bond in template.GetBonds(): + bond.SetIsAromatic(False) + bond.SetBondType(Chem.BondType.SINGLE) + + # go through each possible starting atom + for a in template.GetAtoms(): + smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) + m = Chem.MolFromSmiles(smi, sanitize=False) + for atom in m.GetAtoms(): + atom.SetFormalCharge(0) + atom.SetNoImplicit(True) + m.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(m) + m = _standardize_patterns(m) + Chem.SanitizeMol(m) + m = Chem.RemoveHs(m) + assert m.HasSubstructMatch(ref) and ref.HasSubstructMatch( + m), "Failed when starting from atom %s%d" % ( + a.GetSymbol(), a.GetIdx()) + + def test_warn_conjugated_max_iter(self): + smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" + mol = Chem.MolFromSmiles(smi) + with pytest.warns(UserWarning, + match="reasonable number of iterations"): + _rebuild_conjugated_bonds(mol, 2)