From 90fccceb97e57f10f2663ee3280197d214c5f602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 19 Jun 2020 18:16:58 +0200 Subject: [PATCH 01/90] simple converter --- package/MDAnalysis/coordinates/RDKit.py | 81 ++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3c64241d109..3628735663a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -59,7 +59,25 @@ import numpy as np from . import memory - +from . import base + +try: + from rdkit import Chem +except ImportError: + pass +else: + RDBONDTYPE = { + 'AROMATIC': Chem.BondType.AROMATIC, + 'SINGLE': Chem.BondType.SINGLE, + 'DOUBLE': Chem.BondType.DOUBLE, + 'TRIPLE': Chem.BondType.TRIPLE, + } + RDBONDORDER = { + 1: Chem.BondType.SINGLE, + 1.5: Chem.BondType.AROMATIC, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. @@ -101,4 +119,63 @@ def __init__(self, filename, **kwargs): warnings.warn("No coordinates found in the RDKit molecule") coordinates = np.empty((1,n_atoms,3), dtype=np.float32) coordinates[:] = np.nan - super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) \ No newline at end of file + super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) + + +class RDKitConverter(base.ConverterBase): + """Convert MDAnalysis AtomGroup or Universe to `RDKit `_ :class:`rdkit.Chem.rdchem.Mol`. + + Example + ------- + + .. code-block:: python + + import MDAnalysis as mda + from MDAnalysis.tests.datafiles import PDB_full + u = mda.Universe(PDB_full) + mol = u.select_atoms('resname DMS').convert_to('RDKIT') + + + .. versionadded:: 2.X.X + """ + + lib = 'RDKIT' + units = {'time': None, 'length': 'Angstrom'} + + def convert(self, obj): + """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. + + Parameters + ----------- + obj : AtomGroup or Universe or :class:`Timestep` + """ + try: + from rdkit import Chem + except ImportError: + raise ImportError('RDKit is required for RDKitConverter but ' + 'is not installed. Try installing it with \n' + 'conda install -c conda-forge rdkit') + try: + # make sure to use atoms (Issue 46) + ag_or_ts = obj.atoms + except AttributeError as e: + if isinstance(obj, base.Timestep): + ag_or_ts = obj.copy() + else: + raise TypeError("No Timestep found in obj argument") from e + + mol = Chem.RWMol() + atom_mapper = {} + for atom in ag_or_ts: + rdatom = Chem.Atom(atom.element) + index = mol.AddAtom(rdatom) + atom_mapper[atom.ix] = index + + for bond in ag_or_ts.bonds: + bond_indices = [atom_mapper[i] for i in bond.indices] + bond_type = RDBONDTYPE.get(bond.type.upper(), RDBONDORDER.get( + bond.order, Chem.BondType.SINGLE)) + mol.AddBond(*bond_indices, bond_type) + + Chem.SanitizeMol(mol) + return mol \ No newline at end of file From ba0f89efcf7b1f0d33243a417f97e8980b44848c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 22 Jun 2020 20:22:11 +0200 Subject: [PATCH 02/90] add PDB residue info and guessers --- package/MDAnalysis/coordinates/RDKit.py | 79 +++++++++++++++++++++---- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3628735663a..74e181f65b1 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -55,9 +55,12 @@ """ import warnings +import re import numpy as np +from ..exceptions import NoDataError +from ..topology.guessers import guess_atom_element from . import memory from . import base @@ -78,6 +81,16 @@ 2: Chem.BondType.DOUBLE, 3: Chem.BondType.TRIPLE, } + RDATTRIBUTES = { + "altLoc": "AltLoc", + "chainID": "ChainId", + "name": "Name", + "occupancy": "Occupancy", + "resname": "ResidueName", + "resid": "ResidueNumber", + "segid": "SegmentNumber", + "tempfactor": "TempFactor", + } class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. @@ -147,7 +160,7 @@ def convert(self, obj): Parameters ----------- - obj : AtomGroup or Universe or :class:`Timestep` + obj : AtomGroup or Universe """ try: from rdkit import Chem @@ -157,23 +170,69 @@ def convert(self, obj): 'conda install -c conda-forge rdkit') try: # make sure to use atoms (Issue 46) - ag_or_ts = obj.atoms + ag = obj.atoms except AttributeError as e: - if isinstance(obj, base.Timestep): - ag_or_ts = obj.copy() - else: - raise TypeError("No Timestep found in obj argument") from e + raise TypeError("No `atoms` attribute in object of type {}, " + "please use a valid AtomGroup or Universe".format( + type(obj))) from e mol = Chem.RWMol() atom_mapper = {} - for atom in ag_or_ts: - rdatom = Chem.Atom(atom.element) + for atom in ag: + try: + element = atom.element + except NoDataError: + element = guess_atom_element(atom.name) + rdatom = Chem.Atom(element) + # add properties + mi = Chem.AtomPDBResidueInfo() + for attr, rdattr in RDATTRIBUTES.items(): + try: # get value in MDA atom + value = getattr(atom, attr) + except AttributeError: + pass + else: + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "segid": + # RDKit needs segid to be an int + try: + value = int(value) + except ValueError: + # convert any string to int + value = int(value, 36) + elif attr == "name": + # RDKit needs the name to be properly formated for a + # PDB file (1 letter elements start at col 14) + name = re.findall('(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + else: + symbol, number = name[0], "" + value = "{:>2}".format(symbol) + "{:<2}".format(number) + # set attribute value in RDKit MonomerInfo + getattr(mi, "Set%s" % rdattr)(value) + rdatom.SetMonomerInfo(mi) + # TODO other properties (charges) index = mol.AddAtom(rdatom) + # map index in universe to index in mol atom_mapper[atom.ix] = index - for bond in ag_or_ts.bonds: + try: + bonds = ag.bonds + except NoDataError: + ag.guess_bonds() + bonds = ag.bonds + + for bond in bonds: bond_indices = [atom_mapper[i] for i in bond.indices] - bond_type = RDBONDTYPE.get(bond.type.upper(), RDBONDORDER.get( + try: + bond_type = bond.type.upper() + except AttributeError: + # bond type can be a tuple for PDB files + bond_type = None + bond_type = RDBONDTYPE.get(bond_type, RDBONDORDER.get( bond.order, Chem.BondType.SINGLE)) mol.AddBond(*bond_indices, bond_type) From b54ec604265563481192dd61c90c13fb8238fe02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 24 Jun 2020 20:19:21 +0200 Subject: [PATCH 03/90] fix 2 letter atom names and fix bond order --- package/MDAnalysis/coordinates/RDKit.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 74e181f65b1..e226ad39d7a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -78,9 +78,12 @@ RDBONDORDER = { 1: Chem.BondType.SINGLE, 1.5: Chem.BondType.AROMATIC, + "ar": Chem.BondType.AROMATIC, 2: Chem.BondType.DOUBLE, 3: Chem.BondType.TRIPLE, } + # add string version of the key for each bond + RDBONDORDER.update({str(key):value for key,value in RDBONDORDER.items()}) RDATTRIBUTES = { "altLoc": "AltLoc", "chainID": "ChainId", @@ -178,11 +181,14 @@ def convert(self, obj): mol = Chem.RWMol() atom_mapper = {} + for atom in ag: try: element = atom.element except NoDataError: - element = guess_atom_element(atom.name) + # guess atom element + # capitalize: transform CL to Cl and so on + element = guess_atom_element(atom.name).capitalize() rdatom = Chem.Atom(element) # add properties mi = Chem.AtomPDBResidueInfo() @@ -201,6 +207,7 @@ def convert(self, obj): value = int(value) except ValueError: # convert any string to int + # can be mapped back with np.base_repr(x, 36) value = int(value, 36) elif attr == "name": # RDKit needs the name to be properly formated for a From ac38ceb7ef162befc80292501f591705b71b6423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 24 Jun 2020 20:20:06 +0200 Subject: [PATCH 04/90] test mol2 topology and pdb info --- .../MDAnalysisTests/coordinates/test_rdkit.py | 40 ++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7bdb845f95a..2a3abe47f64 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -28,7 +28,7 @@ from numpy.testing import (assert_equal, assert_almost_equal) -from MDAnalysisTests.datafiles import mol2_molecule +from MDAnalysisTests.datafiles import mol2_molecule, PDB_full Chem = pytest.importorskip("rdkit.Chem") AllChem = pytest.importorskip("rdkit.Chem.AllChem") @@ -76,3 +76,41 @@ def test_compare_mol2reader(self): assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) + +class TestRDKitConverter(object): + @pytest.fixture + def pdb(self): + return mda.Universe(PDB_full) + + @pytest.fixture + def mol2(self): + return mda.Universe(mol2_molecule) + + @pytest.mark.parametrize("sel_str", [ + "resid 1", + "resname LYS and name NZ", + "resid 34 and altloc B", + ]) + def test_monomer_info(self, pdb, sel_str): + rdmol = Chem.MolFromPDBFile(PDB_full) + sel = pdb.select_atoms(sel_str) + umol = sel.convert_to("RDKIT") + atom = umol.GetAtomWithIdx(0) + mi = atom.GetMonomerInfo() + + for mda_attr, rd_attr in mda.coordinates.RDKit.RDATTRIBUTES.items(): + if mda_attr == "occupancy": + mda_attr = "occupancie" + rd_value = getattr(mi, "Get%s" % rd_attr)() + mda_value = getattr(sel, "%ss" % mda_attr)[0] + if mda_attr == "name": + rd_value = rd_value.strip() + elif mda_attr == "segid": + rd_value = np.base_repr(rd_value, 36) + assert rd_value == mda_value + + def test_identical_topology_mol2(self, mol2): + # no chirality check + rdmol = mol2_mol() + umol = mol2.atoms.convert_to("RDKIT") + assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) \ No newline at end of file From de5560bbb744c79d616933760422189aa6806573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 25 Jun 2020 12:28:06 +0200 Subject: [PATCH 05/90] autopep8 --- package/MDAnalysis/coordinates/RDKit.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index e226ad39d7a..128280fdbb9 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -83,7 +83,7 @@ 3: Chem.BondType.TRIPLE, } # add string version of the key for each bond - RDBONDORDER.update({str(key):value for key,value in RDBONDORDER.items()}) + RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) RDATTRIBUTES = { "altLoc": "AltLoc", "chainID": "ChainId", @@ -95,9 +95,10 @@ "tempfactor": "TempFactor", } + class RDKitReader(memory.MemoryReader): """Coordinate reader for RDKit. - + .. versionadded:: 2.0.0 """ format = 'RDKIT' @@ -129,11 +130,11 @@ def __init__(self, filename, **kwargs): """ n_atoms = filename.GetNumAtoms() coordinates = np.array([ - conf.GetPositions() for conf in filename.GetConformers()], + conf.GetPositions() for conf in filename.GetConformers()], dtype=np.float32) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") - coordinates = np.empty((1,n_atoms,3), dtype=np.float32) + coordinates = np.empty((1, n_atoms, 3), dtype=np.float32) coordinates[:] = np.nan super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) @@ -150,7 +151,7 @@ class RDKitConverter(base.ConverterBase): from MDAnalysis.tests.datafiles import PDB_full u = mda.Universe(PDB_full) mol = u.select_atoms('resname DMS').convert_to('RDKIT') - + .. versionadded:: 2.X.X """ @@ -177,7 +178,7 @@ def convert(self, obj): except AttributeError as e: raise TypeError("No `atoms` attribute in object of type {}, " "please use a valid AtomGroup or Universe".format( - type(obj))) from e + type(obj))) from e mol = Chem.RWMol() atom_mapper = {} @@ -193,7 +194,7 @@ def convert(self, obj): # add properties mi = Chem.AtomPDBResidueInfo() for attr, rdattr in RDATTRIBUTES.items(): - try: # get value in MDA atom + try: # get value in MDA atom value = getattr(atom, attr) except AttributeError: pass @@ -244,4 +245,4 @@ def convert(self, obj): mol.AddBond(*bond_indices, bond_type) Chem.SanitizeMol(mol) - return mol \ No newline at end of file + return mol From b00dee33b5a75b7b671300027eb1fc089be6c96c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 25 Jun 2020 15:40:51 +0200 Subject: [PATCH 06/90] fix code review --- package/MDAnalysis/coordinates/RDKit.py | 110 +++++++++++++++--------- 1 file changed, 70 insertions(+), 40 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 128280fdbb9..5b6da37556f 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -40,6 +40,8 @@ >>> u.trajectory +>>> u.atoms.convert_to("RDKIT") + Classes @@ -91,7 +93,7 @@ "occupancy": "Occupancy", "resname": "ResidueName", "resid": "ResidueNumber", - "segid": "SegmentNumber", + "segindex": "SegmentNumber", "tempfactor": "TempFactor", } @@ -153,7 +155,7 @@ class RDKitConverter(base.ConverterBase): mol = u.select_atoms('resname DMS').convert_to('RDKIT') - .. versionadded:: 2.X.X + .. versionadded:: 2.0.0 """ lib = 'RDKIT' @@ -175,54 +177,41 @@ def convert(self, obj): try: # make sure to use atoms (Issue 46) ag = obj.atoms - except AttributeError as e: + except AttributeError: raise TypeError("No `atoms` attribute in object of type {}, " "please use a valid AtomGroup or Universe".format( - type(obj))) from e + type(obj))) from None mol = Chem.RWMol() atom_mapper = {} - for atom in ag: - try: - element = atom.element - except NoDataError: - # guess atom element - # capitalize: transform CL to Cl and so on - element = guess_atom_element(atom.name).capitalize() + try: + elements = ag.elements + except NoDataError: + raise AttributeError( + "The `elements` attribute is required for the RDKitConverter " + "but is not present in this AtomGroup. Please refer to the " + "documentation to guess elements from other attributes. " + "If `types` are present in the AtomGroup, a good starting " + "point would be:\n" + ">>> from MDAnalysis.topology.guessers import " + "guess_atom_element\n" + ">>> elements = np.array([" + "guess_atom_element(x).capitalize() for x in u.atoms.types" + "], dtype=object)\n" + ">>> u.add_TopologyAttr('elements', elements)") from None + + for atom, element in zip(ag, elements): + # create atom rdatom = Chem.Atom(element) - # add properties + # add PDB-like properties mi = Chem.AtomPDBResidueInfo() for attr, rdattr in RDATTRIBUTES.items(): - try: # get value in MDA atom - value = getattr(atom, attr) - except AttributeError: - pass - else: - if isinstance(value, np.generic): - # convert numpy types to python standard types - value = value.item() - if attr == "segid": - # RDKit needs segid to be an int - try: - value = int(value) - except ValueError: - # convert any string to int - # can be mapped back with np.base_repr(x, 36) - value = int(value, 36) - elif attr == "name": - # RDKit needs the name to be properly formated for a - # PDB file (1 letter elements start at col 14) - name = re.findall('(\D+|\d+)', value) - if len(name) == 2: - symbol, number = name - else: - symbol, number = name[0], "" - value = "{:>2}".format(symbol) + "{:<2}".format(number) - # set attribute value in RDKit MonomerInfo - getattr(mi, "Set%s" % rdattr)(value) + _add_mda_attr_to_rdkit(atom, attr, rdattr, mi) rdatom.SetMonomerInfo(mi) - # TODO other properties (charges) + # other properties + # TODO add bfactors, charges, icodes, segids, types + # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol atom_mapper[atom.ix] = index @@ -230,6 +219,9 @@ def convert(self, obj): try: bonds = ag.bonds except NoDataError: + warnings.warn( + "No `bonds` attribute in this AtomGroup. Guessing bonds based" + "on atoms coordinates") ag.guess_bonds() bonds = ag.bonds @@ -246,3 +238,41 @@ def convert(self, obj): Chem.SanitizeMol(mol) return mol + + +def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi): + """Converts an MDAnalysis atom attribute into the RDKit equivalent and + stores it into an RDKit AtomPDBResidueInfo object. + + Parameters + ---------- + + atom : MDAnalysis.core.groups.Atom + The atom to get the attributes from + attr : str + Name of the atom attribute in MDAnalysis in the singular form + rdattr : str + Name of the equivalent attribute in RDKit, as found in the `Set` and + `Get` methods of the `AtomPDBResidueInfo` + mi : rdkit.Chem.rdchem.AtomPDBResidueInfo + MonomerInfo object containing all the relevant atom attributes + """ + try: # get value in MDA atom + value = getattr(atom, attr) + except AttributeError: + pass + else: + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "name": + # RDKit needs the name to be properly formated for a + # PDB file (1 letter elements start at col 14) + name = re.findall('(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + else: + symbol, number = name[0], "" + value = "{:>2}".format(symbol) + "{:<2}".format(number) + # set attribute value in RDKit MonomerInfo + getattr(mi, "Set%s" % rdattr)(value) From 495217d17ccc23f53dd2a5f196bc514cdbaf20ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 25 Jun 2020 18:28:31 +0200 Subject: [PATCH 07/90] store other attributes in each atom --- package/MDAnalysis/coordinates/RDKit.py | 33 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 5b6da37556f..d14f091bf1c 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -171,9 +171,9 @@ def convert(self, obj): try: from rdkit import Chem except ImportError: - raise ImportError('RDKit is required for RDKitConverter but ' - 'is not installed. Try installing it with \n' - 'conda install -c conda-forge rdkit') + raise ImportError("RDKit is required for the RDKitConverter but " + "it's not installed. Try installing it with \n" + "conda install -c conda-forge rdkit") try: # make sure to use atoms (Issue 46) ag = obj.atoms @@ -182,9 +182,6 @@ def convert(self, obj): "please use a valid AtomGroup or Universe".format( type(obj))) from None - mol = Chem.RWMol() - atom_mapper = {} - try: elements = ag.elements except NoDataError: @@ -201,16 +198,31 @@ def convert(self, obj): "], dtype=object)\n" ">>> u.add_TopologyAttr('elements', elements)") from None - for atom, element in zip(ag, elements): + other_attrs = {} + for attr in ["bfactors", "charges", "icodes", "segids", "types"]: + if hasattr(ag, attr): + other_attrs[attr] = getattr(ag, attr) + + mol = Chem.RWMol() + atom_mapper = {} + + for i, (atom, element) in enumerate(zip(ag, elements)): # create atom rdatom = Chem.Atom(element) + # disable adding H to the molecule + rdatom.SetNoImplicit(True) # add PDB-like properties mi = Chem.AtomPDBResidueInfo() for attr, rdattr in RDATTRIBUTES.items(): _add_mda_attr_to_rdkit(atom, attr, rdattr, mi) rdatom.SetMonomerInfo(mi) # other properties - # TODO add bfactors, charges, icodes, segids, types + for attr in other_attrs.keys(): + value = other_attrs[attr][i] + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = str(value) + rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol @@ -218,6 +230,9 @@ def convert(self, obj): try: bonds = ag.bonds + if (len(bonds) == 0) and (ag.n_atoms > 1): + # force guessing bonds + raise NoDataError except NoDataError: warnings.warn( "No `bonds` attribute in this AtomGroup. Guessing bonds based" @@ -236,7 +251,9 @@ def convert(self, obj): bond.order, Chem.BondType.SINGLE)) mol.AddBond(*bond_indices, bond_type) + # sanitization Chem.SanitizeMol(mol) + return mol From eb5d3658caacf0670dcfc04162255a10989bd3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 25 Jun 2020 18:29:55 +0200 Subject: [PATCH 08/90] more tests on topology + pep8 --- .../MDAnalysisTests/coordinates/test_rdkit.py | 91 ++++++++++++++++--- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 2a3abe47f64..46d74e1c2c4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -24,24 +24,34 @@ import pytest import MDAnalysis as mda +from MDAnalysis.topology.guessers import guess_atom_element import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) from MDAnalysisTests.datafiles import mol2_molecule, PDB_full -Chem = pytest.importorskip("rdkit.Chem") -AllChem = pytest.importorskip("rdkit.Chem.AllChem") +try: + from rdkit import Chem + from rdkit.Chem import AllChem +except ImportError: + rdkit_installed = False +else: + rdkit_installed = True + def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) + def smiles_mol(): - mol = Chem.MolFromSmiles("CCO") + mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ (mol2_mol(), 1), @@ -51,7 +61,7 @@ def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames expected = np.array([ - conf.GetPositions() for conf in rdmol.GetConformers()], + conf.GetPositions() for conf in rdmol.GetConformers()], dtype=np.float32) assert_equal(expected, universe.trajectory.coordinate_array) @@ -65,7 +75,7 @@ def test_no_coordinates(self): assert len(w) == 1 assert "No coordinates found" in str( w[-1].message) - expected = np.empty((1,u.atoms.n_atoms,3), dtype=np.float32) + expected = np.empty((1, u.atoms.n_atoms, 3), dtype=np.float32) expected[:] = np.nan assert_equal(u.trajectory.coordinate_array, expected) @@ -73,10 +83,11 @@ def test_compare_mol2reader(self): universe = mda.Universe(mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal(universe.trajectory.ts.positions, + assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) - + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") class TestRDKitConverter(object): @pytest.fixture def pdb(self): @@ -84,7 +95,13 @@ def pdb(self): @pytest.fixture def mol2(self): - return mda.Universe(mol2_molecule) + u = mda.Universe(mol2_molecule) + # add elements + elements = np.array([ + guess_atom_element(x).capitalize() for x in u.atoms.types + ], dtype=object) + u.add_TopologyAttr('elements', elements) + return u @pytest.mark.parametrize("sel_str", [ "resid 1", @@ -97,20 +114,68 @@ def test_monomer_info(self, pdb, sel_str): umol = sel.convert_to("RDKIT") atom = umol.GetAtomWithIdx(0) mi = atom.GetMonomerInfo() - + for mda_attr, rd_attr in mda.coordinates.RDKit.RDATTRIBUTES.items(): if mda_attr == "occupancy": mda_attr = "occupancie" + elif mda_attr == "segindex": + mda_attr = "segindice" rd_value = getattr(mi, "Get%s" % rd_attr)() mda_value = getattr(sel, "%ss" % mda_attr)[0] if mda_attr == "name": rd_value = rd_value.strip() - elif mda_attr == "segid": - rd_value = np.base_repr(rd_value, 36) assert rd_value == mda_value def test_identical_topology_mol2(self, mol2): - # no chirality check + """Check stereochemistry on atoms and bonds (but not yet)""" rdmol = mol2_mol() umol = mol2.atoms.convert_to("RDKIT") - assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) \ No newline at end of file + assert rdmol.HasSubstructMatch(umol, useChirality=False) + assert umol.HasSubstructMatch(rdmol, useChirality=False) + + def test_identical_topology(self): + rdmol = smiles_mol() + u = mda.Universe(rdmol) + umol = u.atoms.convert_to("RDKIT") + assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) + + def test_raise_requires_elements(self): + u = mda.Universe(mol2_molecule) + with pytest.raises(AttributeError) as e: + u.atoms.convert_to("RDKIT") + assert "`elements` attribute is required for the RDKitConverter" in str( + e.value) + + def test_warn_guess_bonds(self, pdb): + pdb.delete_bonds(pdb.bonds) + ag = pdb.select_atoms("resnum 101 and segid A") + pdb.delete_bonds(ag.bonds) + with warnings.catch_warnings(record=True) as w: + # Cause all warnings to always be triggered. + warnings.simplefilter("always") + # trigger warning + ag.convert_to("RDKIT") + assert len(w) == 1 + assert "No `bonds` attribute in this AtomGroup" in str( + w[-1].message) + + @pytest.mark.parametrize("idx", [0, 10, 42]) + def test_other_attributes(self, mol2, idx): + mol = mol2.atoms.convert_to("RDKIT") + rdprops = mol.GetAtomWithIdx(idx).GetPropsAsDict() + for prop in ["charge", "segid", "type"]: + rdprop = rdprops["_MDAnalysis_%s" % prop] + mdaprop = getattr(mol2.atoms[idx], prop) + assert rdprop == mdaprop + + +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + if rdkit_installed: + pass + else: + u = mda.Universe(mol2_molecule) + with pytest.raises(ImportError) as e: + u.atoms.convert_to("RDKIT") + assert "RDKit is required for the RDKitConverter" in str( + e.value) From 177496396788e9b806a43d93c8e34e1e686fec44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 26 Jun 2020 19:49:19 +0200 Subject: [PATCH 09/90] fix other props type --- package/MDAnalysis/coordinates/RDKit.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d14f091bf1c..486e5719250 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -96,6 +96,7 @@ "segindex": "SegmentNumber", "tempfactor": "TempFactor", } + PERIODIC_TABLE = Chem.GetPeriodicTable() class RDKitReader(memory.MemoryReader): @@ -188,15 +189,8 @@ def convert(self, obj): raise AttributeError( "The `elements` attribute is required for the RDKitConverter " "but is not present in this AtomGroup. Please refer to the " - "documentation to guess elements from other attributes. " - "If `types` are present in the AtomGroup, a good starting " - "point would be:\n" - ">>> from MDAnalysis.topology.guessers import " - "guess_atom_element\n" - ">>> elements = np.array([" - "guess_atom_element(x).capitalize() for x in u.atoms.types" - "], dtype=object)\n" - ">>> u.add_TopologyAttr('elements', elements)") from None + "documentation to guess elements from other attributes or " + "type `help(mda.topology.guessers)`") from None other_attrs = {} for attr in ["bfactors", "charges", "icodes", "segids", "types"]: @@ -219,9 +213,12 @@ def convert(self, obj): # other properties for attr in other_attrs.keys(): value = other_attrs[attr][i] - if isinstance(value, np.generic): - # convert numpy types to python standard types - value = str(value) + if isinstance(value, np.float): + rdatom.SetDoubleProp("_MDAnalysis_%s" % attr[:-1], + float(value)) + elif isinstance(value, np.int): + rdatom.SetIntProp("_MDAnalysis_%s" % attr[:-1], int(value)) + else: rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) # add atom index = mol.AddAtom(rdatom) From 81eea81c0becd7160e12fb9312b64c1c47580230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 26 Jun 2020 19:55:29 +0200 Subject: [PATCH 10/90] bond order and charges --- package/MDAnalysis/coordinates/RDKit.py | 115 +++++++++++++++++- .../MDAnalysisTests/coordinates/test_rdkit.py | 34 ++++++ 2 files changed, 146 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 486e5719250..0eeb43a7f69 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -219,7 +219,7 @@ def convert(self, obj): elif isinstance(value, np.int): rdatom.SetIntProp("_MDAnalysis_%s" % attr[:-1], int(value)) else: - rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) + rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol @@ -237,8 +237,19 @@ def convert(self, obj): ag.guess_bonds() bonds = ag.bonds + border_atom_indices = [] for bond in bonds: - bond_indices = [atom_mapper[i] for i in bond.indices] + try: + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + # one of the atoms of the bond is not part of the atomgroup + # save the bond atom that is in the atomgroup for later + for i in bond.indices: + if i in atom_mapper.keys(): + border_atom_indices.append(atom_mapper[i]) + break + # skip the rest + continue try: bond_type = bond.type.upper() except AttributeError: @@ -248,7 +259,12 @@ def convert(self, obj): bond.order, Chem.BondType.SINGLE)) mol.AddBond(*bond_indices, bond_type) - # sanitization + mol.UpdatePropertyCache(strict=False) + + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol, border_atom_indices) + + # sanitize Chem.SanitizeMol(mol) return mol @@ -290,3 +306,96 @@ def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi): value = "{:>2}".format(symbol) + "{:<2}".format(number) # set attribute value in RDKit MonomerInfo getattr(mi, "Set%s" % rdattr)(value) + + +def _infer_bo_and_charges(mol, border_atom_indices): + """Infer bond orders and formal charges from a molecule. + + - Step 1 + Since most MD topology files don't explicitely retain informations on bond + orders or charges, it has to be guessed from the topology. This is done by + looping other each atom and comparing its expected valence to the current + valence, called `delta_v`. If two neighbouring atoms have a common + positive delta_v, the bond between them most likely has a bond order of + 1+delta_v. If an atom doesn't share a delta_v with any of its neighbours, + it likely needs a formal charge of -delta_v. + + - Step 2 + Some atoms can be "mutilated" by a selection (i.e. one of their bonds is cut). The previous step is likely to assign a formal charge to such atoms even if they weren't charged in the original topology. This step converts the resulting charges to radical electrons, or in some cases to higher order bonds. This ensures the atomgroup is not artificially charged because of the previous step. + + Parameters + ---------- + + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + border_atom_indices : list + List of border atoms indices + """ + # Step 1 + for atom in mol.GetAtoms(): + # create delta_v for each possible valence + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() + delta_vs = [expected_v - current_v for expected_v in expected_vs] + + # if there's only one possible valence state and the correpsonding + # delta_v is negative, it means we can only add a positive charge to + # the atom + if (len(delta_vs) == 1) and (delta_vs[0] < 0): + charge = -delta_vs[0] + atom.SetFormalCharge(charge) + mol.UpdatePropertyCache(strict=False) + else: + neighbors = atom.GetNeighbors() + # check if one of the neighbors has a common delta_v + for i, na in enumerate(neighbors, start=1): + # create delta_v for the neighbor + na_expected_vs = PERIODIC_TABLE.GetValenceList( + na.GetAtomicNum()) + na_current = na.GetTotalValence() + na_delta = [ + na_expected - na_current for na_expected in na_expected_vs] + # smallest common delta_v, else NaN + common_delta = min(set(delta_vs).intersection(na_delta), + default=np.nan) + # common_delta == 0 means we don't need to do anything + if common_delta != 0: + # if they have no delta_v in common + if common_delta is np.nan: + # if it's the last neighbor + if i == len(neighbors): + charge = -delta_vs[0] # negative + atom.SetFormalCharge(charge) + mol.UpdatePropertyCache(strict=False) + # if they both need a supplementary bond + else: + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + bond.SetBondType(RDBONDORDER[common_delta+1]) + mol.UpdatePropertyCache(strict=False) + break # out of neighbors loop + + # Step 2 + for i in border_atom_indices: + atom = mol.GetAtomWithIdx(i) + charge = atom.GetFormalCharge() + neighbors = atom.GetNeighbors() + # check if a neighbor atom also bears a charge + for i, na in enumerate(neighbors, 1): + na_charge = na.GetFormalCharge() + if na_charge < 0: + # both atoms have a negative charge + # convert to higher order bond + common_delta = max([charge, na_charge]) + bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx()) + bond.SetBondType(RDBONDORDER[-common_delta+1]) + na.SetFormalCharge(na_charge - common_delta) + atom.SetFormalCharge(0) + atom.SetNumRadicalElectrons(common_delta - charge) + break + elif i == len(neighbors): + # no neighbor shares a negative charge + atom.SetNumRadicalElectrons(-atom.GetFormalCharge()) + atom.SetFormalCharge(0) + mol.UpdatePropertyCache(strict=False) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 46d74e1c2c4..3aac8b8c2f4 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -31,6 +31,8 @@ from MDAnalysisTests.datafiles import mol2_molecule, PDB_full +from MDAnalysis.coordinates.RDKit import _infer_bo_and_charges + try: from rdkit import Chem from rdkit.Chem import AllChem @@ -179,3 +181,35 @@ def test_converter_requires_rdkit(self): u.atoms.convert_to("RDKIT") assert "RDKit is required for the RDKitConverter" in str( e.value) + + +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +class TestRDKitFunctions(object): + @pytest.mark.parametrize("smi, out", [ + ("[H]C([H])([H])[H]", "C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", "c1ccccc1"), + ("[Cl]-[C](-[H])-[O]", "C(=O)Cl"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ("[P](-O)(-O)(-O)-[O]", "P(O)(O)(O)=O"), + ("[N]-[C]-[H]", "N#C"), + ]) + def test_infer_bond_orders(self, smi, out): + molin = Chem.MolFromSmiles(smi, sanitize=False) + molin = _infer_bo_and_charges(molin) + molin = Chem.RemoveHs(molin) + molref = Chem.MolFromSmiles(out) + assert molin.HasSubstructMatch( + molref) and molref.HasSubstructMatch(molin) + + @pytest.mark.parametrize("smi, atom, charge", [ + ("C-[O]", "O", -1), + ("[N]-[C]-[O]", "O", -1), + ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), + ]) + def test_infer_charges(self, smi, atom, charge): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol = _infer_bo_and_charges(mol) + index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] + atom = mol.GetAtomWithIdx(index) + assert atom.GetFormalCharge() == charge From 6d6a65beddf5d5d100d8bc259631c493b5b9a05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 26 Jun 2020 20:00:34 +0200 Subject: [PATCH 11/90] fix indentation and bonds with atoms outside of atomgroup --- package/MDAnalysis/coordinates/RDKit.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 486e5719250..d8c03aef9b9 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -213,13 +213,13 @@ def convert(self, obj): # other properties for attr in other_attrs.keys(): value = other_attrs[attr][i] + attr = attr[:-1] if isinstance(value, np.float): - rdatom.SetDoubleProp("_MDAnalysis_%s" % attr[:-1], - float(value)) + rdatom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) elif isinstance(value, np.int): - rdatom.SetIntProp("_MDAnalysis_%s" % attr[:-1], int(value)) + rdatom.SetIntProp("_MDAnalysis_%s" % attr, int(value)) else: - rdatom.SetProp("_MDAnalysis_%s" % attr[:-1], value) + rdatom.SetProp("_MDAnalysis_%s" % attr, value) # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol @@ -238,7 +238,11 @@ def convert(self, obj): bonds = ag.bonds for bond in bonds: - bond_indices = [atom_mapper[i] for i in bond.indices] + try: + bond_indices = [atom_mapper[i] for i in bond.indices] + except KeyError: + # one of the atoms of the bond is not part of the atomgroup + continue try: bond_type = bond.type.upper() except AttributeError: From cf0706c385d72112647cc2a8bab4f464926933ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 26 Jun 2020 20:02:12 +0200 Subject: [PATCH 12/90] fix test --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 46d74e1c2c4..6257c5d3405 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -40,10 +40,12 @@ rdkit_installed = True +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) +@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") def smiles_mol(): mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) From d41fa5bd5ec2df356331a912ea81d73331ed2fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 14:55:44 +0200 Subject: [PATCH 13/90] fix test minimal build --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 6257c5d3405..d8127ba5659 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -40,12 +40,12 @@ rdkit_installed = True -@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) -@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") def smiles_mol(): mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) @@ -53,7 +53,7 @@ def smiles_mol(): return mol -@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ (mol2_mol(), 1), @@ -89,7 +89,7 @@ def test_compare_mol2reader(self): mol2.trajectory.ts.positions) -@pytest.mark.skipif(rdkit_installed is False, reason="requires RDKit") +@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") class TestRDKitConverter(object): @pytest.fixture def pdb(self): From fe27f0bbe23e06f3b6c979090da489d471ea0c49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 14:56:33 +0200 Subject: [PATCH 14/90] add icodes to MonomerInfo --- package/MDAnalysis/coordinates/RDKit.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d8c03aef9b9..f7e93ef3c52 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -89,6 +89,7 @@ RDATTRIBUTES = { "altLoc": "AltLoc", "chainID": "ChainId", + "icode": "InsertionCode", "name": "Name", "occupancy": "Occupancy", "resname": "ResidueName", @@ -193,7 +194,7 @@ def convert(self, obj): "type `help(mda.topology.guessers)`") from None other_attrs = {} - for attr in ["bfactors", "charges", "icodes", "segids", "types"]: + for attr in ["bfactors", "charges", "segids", "types"]: if hasattr(ag, attr): other_attrs[attr] = getattr(ag, attr) @@ -213,7 +214,7 @@ def convert(self, obj): # other properties for attr in other_attrs.keys(): value = other_attrs[attr][i] - attr = attr[:-1] + attr = attr[:-1] # plural to singular if isinstance(value, np.float): rdatom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) elif isinstance(value, np.int): @@ -273,7 +274,7 @@ def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi): Name of the equivalent attribute in RDKit, as found in the `Set` and `Get` methods of the `AtomPDBResidueInfo` mi : rdkit.Chem.rdchem.AtomPDBResidueInfo - MonomerInfo object containing all the relevant atom attributes + MonomerInfo object that will store the relevant atom attributes """ try: # get value in MDA atom value = getattr(atom, attr) From 3590113cc194e6a9b6b98f8d7ae4367e1726e7fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 16:07:23 +0200 Subject: [PATCH 15/90] better attributes conversion --- package/MDAnalysis/coordinates/RDKit.py | 71 +++++++++---------- .../MDAnalysisTests/coordinates/test_rdkit.py | 20 +++--- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f7e93ef3c52..abba21adaf7 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -87,15 +87,15 @@ # add string version of the key for each bond RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) RDATTRIBUTES = { - "altLoc": "AltLoc", - "chainID": "ChainId", - "icode": "InsertionCode", - "name": "Name", - "occupancy": "Occupancy", - "resname": "ResidueName", - "resid": "ResidueNumber", - "segindex": "SegmentNumber", - "tempfactor": "TempFactor", + "altLocs": "AltLoc", + "chainIDs": "ChainId", + "icodes": "InsertionCode", + "names": "Name", + "occupancies": "Occupancy", + "resnames": "ResidueName", + "resids": "ResidueNumber", + "segindices": "SegmentNumber", + "tempfactors": "TempFactor", } PERIODIC_TABLE = Chem.GetPeriodicTable() @@ -193,6 +193,12 @@ def convert(self, obj): "documentation to guess elements from other attributes or " "type `help(mda.topology.guessers)`") from None + # attributes accepted in PDBResidueInfo object + pdb_attrs = {} + for attr in RDATTRIBUTES.keys(): + if hasattr(ag, attr): + pdb_attrs[attr] = getattr(ag, attr) + # others other_attrs = {} for attr in ["bfactors", "charges", "segids", "types"]: if hasattr(ag, attr): @@ -208,8 +214,8 @@ def convert(self, obj): rdatom.SetNoImplicit(True) # add PDB-like properties mi = Chem.AtomPDBResidueInfo() - for attr, rdattr in RDATTRIBUTES.items(): - _add_mda_attr_to_rdkit(atom, attr, rdattr, mi) + for attr, values in pdb_attrs.items(): + _add_mda_attr_to_rdkit(attr, values[i], mi) rdatom.SetMonomerInfo(mi) # other properties for attr in other_attrs.keys(): @@ -259,39 +265,32 @@ def convert(self, obj): return mol -def _add_mda_attr_to_rdkit(atom, attr, rdattr, mi): +def _add_mda_attr_to_rdkit(attr, value, mi): """Converts an MDAnalysis atom attribute into the RDKit equivalent and stores it into an RDKit AtomPDBResidueInfo object. Parameters ---------- - atom : MDAnalysis.core.groups.Atom - The atom to get the attributes from attr : str Name of the atom attribute in MDAnalysis in the singular form - rdattr : str - Name of the equivalent attribute in RDKit, as found in the `Set` and - `Get` methods of the `AtomPDBResidueInfo` + value : object, np.int or np.float + Attribute value as found in the AtomGroup mi : rdkit.Chem.rdchem.AtomPDBResidueInfo MonomerInfo object that will store the relevant atom attributes """ - try: # get value in MDA atom - value = getattr(atom, attr) - except AttributeError: - pass - else: - if isinstance(value, np.generic): - # convert numpy types to python standard types - value = value.item() - if attr == "name": - # RDKit needs the name to be properly formated for a - # PDB file (1 letter elements start at col 14) - name = re.findall('(\D+|\d+)', value) - if len(name) == 2: - symbol, number = name - else: - symbol, number = name[0], "" - value = "{:>2}".format(symbol) + "{:<2}".format(number) - # set attribute value in RDKit MonomerInfo - getattr(mi, "Set%s" % rdattr)(value) + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "name": + # RDKit needs the name to be properly formated for a + # PDB file (1 letter elements start at col 14) + name = re.findall('(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + else: + symbol, number = name[0], "" + value = "{:>2}".format(symbol) + "{:<2}".format(number) + # set attribute value in RDKit MonomerInfo + rdattr = RDATTRIBUTES[attr] + getattr(mi, "Set%s" % rdattr)(value) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index d8127ba5659..67cfb7c9af9 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -105,26 +105,22 @@ def mol2(self): u.add_TopologyAttr('elements', elements) return u - @pytest.mark.parametrize("sel_str", [ - "resid 1", - "resname LYS and name NZ", - "resid 34 and altloc B", + @pytest.mark.parametrize("sel_str, atom_index", [ + ("resid 1", 0), + ("resname LYS and name NZ", 1), + ("resid 34 and altloc B", 2), ]) - def test_monomer_info(self, pdb, sel_str): + def test_monomer_info(self, pdb, sel_str, atom_index): rdmol = Chem.MolFromPDBFile(PDB_full) sel = pdb.select_atoms(sel_str) umol = sel.convert_to("RDKIT") - atom = umol.GetAtomWithIdx(0) + atom = umol.GetAtomWithIdx(atom_index) mi = atom.GetMonomerInfo() for mda_attr, rd_attr in mda.coordinates.RDKit.RDATTRIBUTES.items(): - if mda_attr == "occupancy": - mda_attr = "occupancie" - elif mda_attr == "segindex": - mda_attr = "segindice" rd_value = getattr(mi, "Get%s" % rd_attr)() - mda_value = getattr(sel, "%ss" % mda_attr)[0] - if mda_attr == "name": + mda_value = getattr(sel, "%s" % mda_attr)[atom_index] + if mda_attr == "names": rd_value = rd_value.strip() assert rd_value == mda_value From 2d79014dfe5c36ee38647618d7bbc825125ad69a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 17:37:30 +0200 Subject: [PATCH 16/90] test mda to rdkit MonomerInfo conversion --- package/MDAnalysis/coordinates/RDKit.py | 4 ++-- .../MDAnalysisTests/coordinates/test_rdkit.py | 22 ++++++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index abba21adaf7..9302f7cd247 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -282,7 +282,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): if isinstance(value, np.generic): # convert numpy types to python standard types value = value.item() - if attr == "name": + if attr == "names": # RDKit needs the name to be properly formated for a # PDB file (1 letter elements start at col 14) name = re.findall('(\D+|\d+)', value) @@ -290,7 +290,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): symbol, number = name else: symbol, number = name[0], "" - value = "{:>2}".format(symbol) + "{:<2}".format(number) + value = "{:>2}{:<2}".format(symbol, number) # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] getattr(mi, "Set%s" % rdattr)(value) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 67cfb7c9af9..0ac29f9299c 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -25,6 +25,7 @@ import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element +from MDAnalysis.coordinates.RDKit import RDATTRIBUTES, _add_mda_attr_to_rdkit import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) @@ -117,7 +118,7 @@ def test_monomer_info(self, pdb, sel_str, atom_index): atom = umol.GetAtomWithIdx(atom_index) mi = atom.GetMonomerInfo() - for mda_attr, rd_attr in mda.coordinates.RDKit.RDATTRIBUTES.items(): + for mda_attr, rd_attr in RDATTRIBUTES.items(): rd_value = getattr(mi, "Get%s" % rd_attr)() mda_value = getattr(sel, "%s" % mda_attr)[atom_index] if mda_attr == "names": @@ -157,6 +158,25 @@ def test_warn_guess_bonds(self, pdb): assert "No `bonds` attribute in this AtomGroup" in str( w[-1].message) + @pytest.mark.parametrize("attr, value, expected", [ + ("names", "C1", " C1 "), + ("names", "C12", " C12"), + ("names", "Cl1", "Cl1 "), + ("altLocs", "A", "A"), + ("chainIDs", "B", "B"), + ("icodes", "C", "C"), + ("occupancies", 0.5, 0.5), + ("resnames", "LIG", "LIG"), + ("resids", 123, 123), + ("segindices", 1, 1), + ("tempfactors", 0.8, 0.8), + ]) + def test_add_mda_attr_to_rdkit(self, attr, value, expected): + mi = Chem.AtomPDBResidueInfo() + _add_mda_attr_to_rdkit(attr, value, mi) + rdvalue = getattr(mi, "Get%s" % RDATTRIBUTES[attr])() + assert rdvalue == expected + @pytest.mark.parametrize("idx", [0, 10, 42]) def test_other_attributes(self, mol2, idx): mol = mol2.atoms.convert_to("RDKIT") From 68dc7ca94e277f00e6a0e4e66235a145faa710a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 17:53:41 +0200 Subject: [PATCH 17/90] test mda to rdkit MonomerInfo conversion --- package/MDAnalysis/coordinates/RDKit.py | 34 +++++++++++-------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 699dbed344e..7196a8a0078 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -291,25 +291,21 @@ def _add_mda_attr_to_rdkit(attr, value, mi): mi : rdkit.Chem.rdchem.AtomPDBResidueInfo MonomerInfo object that will store the relevant atom attributes """ - try: # get value in MDA atom - value = getattr(atom, attr) - except AttributeError: - pass - else: - if isinstance(value, np.generic): - # convert numpy types to python standard types - value = value.item() - if attr == "name": - # RDKit needs the name to be properly formated for a - # PDB file (1 letter elements start at col 14) - name = re.findall('(\D+|\d+)', value) - if len(name) == 2: - symbol, number = name - else: - symbol, number = name[0], "" - value = "{:>2}".format(symbol) + "{:<2}".format(number) - # set attribute value in RDKit MonomerInfo - getattr(mi, "Set%s" % rdattr)(value) + if isinstance(value, np.generic): + # convert numpy types to python standard types + value = value.item() + if attr == "names": + # RDKit needs the name to be properly formated for a + # PDB file (1 letter elements start at col 14) + name = re.findall('(\D+|\d+)', value) + if len(name) == 2: + symbol, number = name + else: + symbol, number = name[0], "" + value = "{:>2}{:<2}".format(symbol, number) + # set attribute value in RDKit MonomerInfo + rdattr = RDATTRIBUTES[attr] + getattr(mi, "Set%s" % rdattr)(value) def _infer_bo_and_charges(mol, border_atom_indices): From f57e9e632f5b17138da61bc3c0026cae93d7ef29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 18:17:34 +0200 Subject: [PATCH 18/90] docs --- package/MDAnalysis/coordinates/RDKit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 7196a8a0078..418c846f156 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -321,7 +321,7 @@ def _infer_bo_and_charges(mol, border_atom_indices): it likely needs a formal charge of -delta_v. - Step 2 - Some atoms can be "mutilated" by a selection (i.e. one of their bonds is cut). The previous step is likely to assign a formal charge to such atoms even if they weren't charged in the original topology. This step converts the resulting charges to radical electrons, or in some cases to higher order bonds. This ensures the atomgroup is not artificially charged because of the previous step. + Some atoms can be "mutilated" by a selection (i.e. one of their bonds is cut). The previous step is likely to assign a formal charge to such atoms even if they weren't charged in the original topology. This step converts the resulting charges to higher order bonds when possible, or to radical electrons. This ensures the atomgroup is not artificially charged because of the previous step. Parameters ---------- From 99da261527f90bda06e9a5c9827742b1ae56a038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 29 Jun 2020 18:23:30 +0200 Subject: [PATCH 19/90] fix minimal deps test --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 0ac29f9299c..e66162f33a1 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -25,7 +25,6 @@ import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysis.coordinates.RDKit import RDATTRIBUTES, _add_mda_attr_to_rdkit import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) @@ -35,6 +34,8 @@ try: from rdkit import Chem from rdkit.Chem import AllChem + from MDAnalysis.coordinates.RDKit import ( + RDATTRIBUTES, _add_mda_attr_to_rdkit) except ImportError: rdkit_installed = False else: From 18a05aba3a4013bc913edd6912ce5b611585e7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 30 Jun 2020 17:31:52 +0200 Subject: [PATCH 20/90] =?UTF-8?q?fix=20for=20minimal=20deps=20tests=20?= =?UTF-8?q?=F0=9F=99=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../MDAnalysisTests/coordinates/test_rdkit.py | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index e66162f33a1..a4dfec5c137 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -38,24 +38,30 @@ RDATTRIBUTES, _add_mda_attr_to_rdkit) except ImportError: rdkit_installed = False + + def mol2_mol(): + pass + + def smiles_mol(): + pass else: rdkit_installed = True + def mol2_mol(): + return Chem.MolFromMol2File(mol2_molecule, removeHs=False) -@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") -def mol2_mol(): - return Chem.MolFromMol2File(mol2_molecule, removeHs=False) + def smiles_mol(): + mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") + mol = Chem.AddHs(mol) + cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) + return mol -@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") -def smiles_mol(): - mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") - mol = Chem.AddHs(mol) - cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) - return mol +requires_rdkit = pytest.mark.skipif(rdkit_installed == False, + reason="requires RDKit") -@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") +@requires_rdkit class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ (mol2_mol(), 1), @@ -91,7 +97,7 @@ def test_compare_mol2reader(self): mol2.trajectory.ts.positions) -@pytest.mark.skipif(rdkit_installed == False, reason="requires RDKit") +@requires_rdkit class TestRDKitConverter(object): @pytest.fixture def pdb(self): @@ -188,13 +194,10 @@ def test_other_attributes(self, mol2, idx): assert rdprop == mdaprop +@pytest.mark.skipif(rdkit_installed == True, reason="test minimal dependency") class TestRequiresRDKit(object): def test_converter_requires_rdkit(self): - if rdkit_installed: - pass - else: - u = mda.Universe(mol2_molecule) - with pytest.raises(ImportError) as e: - u.atoms.convert_to("RDKIT") - assert "RDKit is required for the RDKitConverter" in str( - e.value) + u = mda.Universe(mol2_molecule) + with pytest.raises(ImportError) as e: + u.atoms.convert_to("RDKIT") + assert "RDKit is required for the RDKitConverter" in str(e.value) From 1c14478bb41cfd5cb774e9b52b4520ea50a898cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 30 Jun 2020 18:09:27 +0200 Subject: [PATCH 21/90] use util functions --- .../MDAnalysisTests/coordinates/test_rdkit.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index a4dfec5c137..35a85ad81de 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -30,6 +30,7 @@ assert_almost_equal) from MDAnalysisTests.datafiles import mol2_molecule, PDB_full +from MDAnalysisTests.util import block_import, import_not_available try: from rdkit import Chem @@ -37,16 +38,12 @@ from MDAnalysis.coordinates.RDKit import ( RDATTRIBUTES, _add_mda_attr_to_rdkit) except ImportError: - rdkit_installed = False - def mol2_mol(): pass def smiles_mol(): pass else: - rdkit_installed = True - def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) @@ -57,7 +54,16 @@ def smiles_mol(): return mol -requires_rdkit = pytest.mark.skipif(rdkit_installed == False, +@block_import('rdkit') +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + u = mda.Universe(mol2_molecule) + with pytest.raises(ImportError) as e: + u.atoms.convert_to("RDKIT") + assert "RDKit is required for the RDKitConverter" in str(e.value) + + +requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), reason="requires RDKit") @@ -192,12 +198,3 @@ def test_other_attributes(self, mol2, idx): rdprop = rdprops["_MDAnalysis_%s" % prop] mdaprop = getattr(mol2.atoms[idx], prop) assert rdprop == mdaprop - - -@pytest.mark.skipif(rdkit_installed == True, reason="test minimal dependency") -class TestRequiresRDKit(object): - def test_converter_requires_rdkit(self): - u = mda.Universe(mol2_molecule) - with pytest.raises(ImportError) as e: - u.atoms.convert_to("RDKIT") - assert "RDKit is required for the RDKitConverter" in str(e.value) From f5c798a574e8d9df5e0203462aad4a08d8b157d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 30 Jun 2020 18:37:26 +0200 Subject: [PATCH 22/90] move min deps test up --- .../MDAnalysisTests/coordinates/test_rdkit.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 35a85ad81de..857f984e0c0 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -32,6 +32,16 @@ from MDAnalysisTests.datafiles import mol2_molecule, PDB_full from MDAnalysisTests.util import block_import, import_not_available + +@block_import('rdkit') +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + u = mda.Universe(mol2_molecule) + with pytest.raises(ImportError) as e: + u.atoms.convert_to("RDKIT") + assert "RDKit is required for the RDKitConverter" in str(e.value) + + try: from rdkit import Chem from rdkit.Chem import AllChem @@ -54,15 +64,6 @@ def smiles_mol(): return mol -@block_import('rdkit') -class TestRequiresRDKit(object): - def test_converter_requires_rdkit(self): - u = mda.Universe(mol2_molecule) - with pytest.raises(ImportError) as e: - u.atoms.convert_to("RDKIT") - assert "RDKit is required for the RDKitConverter" in str(e.value) - - requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), reason="requires RDKit") From 558f7cd4efeacf02da4be60e2c0849bbcd0f3b54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 30 Jun 2020 19:39:09 +0200 Subject: [PATCH 23/90] capitalize elements by default --- package/MDAnalysis/coordinates/RDKit.py | 2 +- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 9302f7cd247..ab04600772f 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -209,7 +209,7 @@ def convert(self, obj): for i, (atom, element) in enumerate(zip(ag, elements)): # create atom - rdatom = Chem.Atom(element) + rdatom = Chem.Atom(element.capitalize()) # disable adding H to the molecule rdatom.SetNoImplicit(True) # add PDB-like properties diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 857f984e0c0..bdbaf0606ac 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -114,9 +114,8 @@ def pdb(self): def mol2(self): u = mda.Universe(mol2_molecule) # add elements - elements = np.array([ - guess_atom_element(x).capitalize() for x in u.atoms.types - ], dtype=object) + elements = np.array([guess_atom_element(x) for x in u.atoms.types], + dtype=object) u.add_TopologyAttr('elements', elements) return u From 984f507b54c04069aa5dec85cd9e6158bf0916c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 1 Jul 2020 20:09:58 +0200 Subject: [PATCH 24/90] store index in MDA object --- package/MDAnalysis/coordinates/RDKit.py | 1 + testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index ab04600772f..e34633b6f98 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -227,6 +227,7 @@ def convert(self, obj): rdatom.SetIntProp("_MDAnalysis_%s" % attr, int(value)) else: rdatom.SetProp("_MDAnalysis_%s" % attr, value) + rdatom.SetIntProp("_MDAnalysis_index", int(atom.ix)) # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index bdbaf0606ac..9653525d074 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -198,3 +198,14 @@ def test_other_attributes(self, mol2, idx): rdprop = rdprops["_MDAnalysis_%s" % prop] mdaprop = getattr(mol2.atoms[idx], prop) assert rdprop == mdaprop + + @pytest.mark.parametrize("sel_str", [ + "resname ALA", + "resname PRO and segid A", + ]) + def test_index_property(self, pdb, sel_str): + ag = pdb.select_atoms(sel_str) + mol = ag.convert_to("RDKIT") + expected = ag.indices + indices = np.array([a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()], dtype=np.int32) + assert_equal(indices, expected) From e582e63a9a7cc62c74184c7910721a38a8fa0f35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 2 Jul 2020 00:17:58 +0200 Subject: [PATCH 25/90] fetch singular from _TOPOLOGY_ATTRS --- package/MDAnalysis/coordinates/RDKit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index e34633b6f98..89bb23b172a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -63,6 +63,7 @@ from ..exceptions import NoDataError from ..topology.guessers import guess_atom_element +from ..core.topologyattrs import _TOPOLOGY_ATTRS from . import memory from . import base @@ -220,7 +221,7 @@ def convert(self, obj): # other properties for attr in other_attrs.keys(): value = other_attrs[attr][i] - attr = attr[:-1] # plural to singular + attr = _TOPOLOGY_ATTRS[attr].singular if isinstance(value, np.float): rdatom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) elif isinstance(value, np.int): From 931a81b3e50dab20af7e82f50aa9f6c015f1275b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 2 Jul 2020 11:05:19 +0200 Subject: [PATCH 26/90] updated changelog --- package/CHANGELOG | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package/CHANGELOG b/package/CHANGELOG index a0aea381aac..67092ea8af0 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -32,6 +32,8 @@ Enhancements token (Issue #2468, PR #2707) * Added the `from_smiles` classmethod to the Universe (Issue #2468, PR #2707) * Added computation of Mean Squared Displacements (#2438, PR #2619) + * Added a simple version of the RDKitConverter that handles uncharged + molecules with bond orders/bond types assigned (#2468, PR #2775) Changes * Changes development status from Beta to Mature (Issue #2773) From c432ea96bde6438175aaf68f3f3e9d2d30b1cec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 3 Jul 2020 14:19:22 +0200 Subject: [PATCH 27/90] use ag intersection for bonds instead of try except --- package/MDAnalysis/coordinates/RDKit.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 89bb23b172a..d8a3a811417 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -246,12 +246,11 @@ def convert(self, obj): ag.guess_bonds() bonds = ag.bonds + # only keep bonds where both atoms belong to the AtomGroup + bonds = bonds.atomgroup_intersection(ag, strict=True) + for bond in bonds: - try: - bond_indices = [atom_mapper[i] for i in bond.indices] - except KeyError: - # one of the atoms of the bond is not part of the atomgroup - continue + bond_indices = [atom_mapper[i] for i in bond.indices] try: bond_type = bond.type.upper() except AttributeError: From b0c39ebdada098083289398f839826d0525bd189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 3 Jul 2020 16:16:44 +0200 Subject: [PATCH 28/90] more tests + pep8 --- package/MDAnalysis/coordinates/RDKit.py | 2 +- .../MDAnalysisTests/coordinates/test_rdkit.py | 29 +++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d8a3a811417..f8e2e328a61 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -248,7 +248,7 @@ def convert(self, obj): # only keep bonds where both atoms belong to the AtomGroup bonds = bonds.atomgroup_intersection(ag, strict=True) - + for bond in bonds: bond_indices = [atom_mapper[i] for i in bond.indices] try: diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 9653525d074..901414f4d02 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -114,11 +114,35 @@ def pdb(self): def mol2(self): u = mda.Universe(mol2_molecule) # add elements - elements = np.array([guess_atom_element(x) for x in u.atoms.types], + elements = np.array([guess_atom_element(x) for x in u.atoms.types], dtype=object) u.add_TopologyAttr('elements', elements) return u + @pytest.fixture + def peptide(self): + mol = Chem.MolFromSequence("MDANALYSISandRDKIT") + u = mda.Universe(mol) + return u + + @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) + def test_single_atom_mol(self, smi): + u = mda.Universe.from_smiles(smi, addHs=False, + generate_coordinates=False) + mol = u.atoms.convert_to("RDKIT") + assert mol.GetNumAtoms() == 1 + + @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ + ("MET", 8, 1), + ("THR", 8, 1), + ("ILE", 16, 2), + ("ASP", 24, 3), + ]) + def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): + mol = peptide.select_atoms("resname %s" % resname).convert_to("RDKIT") + assert n_atoms == mol.GetNumAtoms() + assert n_fragments == len(Chem.GetMolFrags(mol)) + @pytest.mark.parametrize("sel_str, atom_index", [ ("resid 1", 0), ("resname LYS and name NZ", 1), @@ -207,5 +231,6 @@ def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) mol = ag.convert_to("RDKIT") expected = ag.indices - indices = np.array([a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()], dtype=np.int32) + indices = np.array([a.GetIntProp("_MDAnalysis_index") + for a in mol.GetAtoms()], dtype=np.int32) assert_equal(indices, expected) From dc0c2761c90561b4774055549172ff7f935b85ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 6 Jul 2020 17:15:45 +0200 Subject: [PATCH 29/90] fix test for numpy 1.13.3 --- .../MDAnalysisTests/coordinates/test_rdkit.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 901414f4d02..8f17d01a0b7 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -29,7 +29,7 @@ from numpy.testing import (assert_equal, assert_almost_equal) -from MDAnalysisTests.datafiles import mol2_molecule, PDB_full +from MDAnalysisTests.datafiles import mol2_molecule, PDB_full, GRO from MDAnalysisTests.util import block_import, import_not_available @@ -121,9 +121,10 @@ def mol2(self): @pytest.fixture def peptide(self): - mol = Chem.MolFromSequence("MDANALYSISandRDKIT") - u = mda.Universe(mol) - return u + u = mda.Universe(GRO) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr('elements', elements) + return u.select_atoms("resid 2-12") @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) def test_single_atom_mol(self, smi): @@ -133,10 +134,10 @@ def test_single_atom_mol(self, smi): assert mol.GetNumAtoms() == 1 @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ - ("MET", 8, 1), - ("THR", 8, 1), - ("ILE", 16, 2), - ("ASP", 24, 3), + ("PRO", 14, 1), + ("ILE", 38, 1), + ("ALA", 20, 2), + ("GLY", 21, 3), ]) def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): mol = peptide.select_atoms("resname %s" % resname).convert_to("RDKIT") From 0fd38b0642b9b1ecd94237b0bb4fb0fcfa7451cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 11:52:04 +0200 Subject: [PATCH 30/90] docs + pep8 --- package/MDAnalysis/coordinates/RDKit.py | 20 +++++++++++-------- .../source/documentation_pages/converters.rst | 1 + .../converters/RDKitParser.rst | 3 +++ .../topology/RDKitParser.rst | 1 - .../documentation_pages/topology_modules.rst | 1 - .../MDAnalysisTests/coordinates/test_rdkit.py | 4 ++-- 6 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst delete mode 100644 package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f8e2e328a61..29b6a99e685 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -21,11 +21,11 @@ # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 # -"""RDKit molecule --- :mod:`MDAnalysis.coordinates.RDKit` +"""RDKit molecule I/O --- :mod:`MDAnalysis.coordinates.RDKit` ================================================================ -Read coordinates data from an `RDKit `_ :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` -into a MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with +Read coordinates data from an `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` +into an MDAnalysis Universe. Convert it back to an :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. @@ -53,6 +53,8 @@ .. autoclass:: RDKitConverter :members: +.. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol + """ @@ -124,7 +126,7 @@ def _format_hint(thing): def __init__(self, filename, **kwargs): """Read coordinates from an RDKit molecule. - Each conformer in the original RDKit molecule will be read as a frame + Each conformer in the original RDKit molecule will be read as a frame in the resulting universe. Parameters @@ -145,7 +147,7 @@ def __init__(self, filename, **kwargs): class RDKitConverter(base.ConverterBase): - """Convert MDAnalysis AtomGroup or Universe to `RDKit `_ :class:`rdkit.Chem.rdchem.Mol`. + """Convert MDAnalysis AtomGroup or Universe to `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` Example ------- @@ -159,13 +161,15 @@ class RDKitConverter(base.ConverterBase): .. versionadded:: 2.0.0 + + .. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol """ lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} def convert(self, obj): - """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. + """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. Parameters ----------- @@ -267,7 +271,7 @@ def convert(self, obj): def _add_mda_attr_to_rdkit(attr, value, mi): - """Converts an MDAnalysis atom attribute into the RDKit equivalent and + """Converts an MDAnalysis atom attribute into the RDKit equivalent and stores it into an RDKit AtomPDBResidueInfo object. Parameters @@ -286,7 +290,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): if attr == "names": # RDKit needs the name to be properly formated for a # PDB file (1 letter elements start at col 14) - name = re.findall('(\D+|\d+)', value) + name = re.findall(r'(\D+|\d+)', value) if len(name) == 2: symbol, number = name else: diff --git a/package/doc/sphinx/source/documentation_pages/converters.rst b/package/doc/sphinx/source/documentation_pages/converters.rst index 8bdcb913fc0..c70d2324184 100644 --- a/package/doc/sphinx/source/documentation_pages/converters.rst +++ b/package/doc/sphinx/source/documentation_pages/converters.rst @@ -33,4 +33,5 @@ you will have to specify a package name (case-insensitive). :: :maxdepth: 1 converters/ParmEdParser + converters/RDKitParser diff --git a/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst b/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst new file mode 100644 index 00000000000..174a1ff1115 --- /dev/null +++ b/package/doc/sphinx/source/documentation_pages/converters/RDKitParser.rst @@ -0,0 +1,3 @@ +.. automodule:: MDAnalysis.topology.RDKitParser + +.. automodule:: MDAnalysis.coordinates.RDKit diff --git a/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst b/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst deleted file mode 100644 index 9f6bbd1dac9..00000000000 --- a/package/doc/sphinx/source/documentation_pages/topology/RDKitParser.rst +++ /dev/null @@ -1 +0,0 @@ -.. automodule:: MDAnalysis.topology.RDKitParser \ No newline at end of file diff --git a/package/doc/sphinx/source/documentation_pages/topology_modules.rst b/package/doc/sphinx/source/documentation_pages/topology_modules.rst index 04ebf433e27..ed8caba8ce6 100644 --- a/package/doc/sphinx/source/documentation_pages/topology_modules.rst +++ b/package/doc/sphinx/source/documentation_pages/topology_modules.rst @@ -43,7 +43,6 @@ topology file format in the *topology_format* keyword argument to topology/PDBQTParser topology/PQRParser topology/PSFParser - topology/RDKitParser topology/TOPParser topology/TPRParser topology/TXYZParser diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 8f17d01a0b7..3e5c66ae640 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -180,8 +180,8 @@ def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) with pytest.raises(AttributeError) as e: u.atoms.convert_to("RDKIT") - assert "`elements` attribute is required for the RDKitConverter" in str( - e.value) + assert ("`elements` attribute is required for the RDKitConverter" + in str(e.value)) def test_warn_guess_bonds(self, pdb): pdb.delete_bonds(pdb.bonds) From 25ee227c864803e4b4006d0afc38b2348abf000e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 12:00:04 +0200 Subject: [PATCH 31/90] fix too long atom names in PDB output --- package/MDAnalysis/coordinates/RDKit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 29b6a99e685..6810dd8c616 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -295,7 +295,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): symbol, number = name else: symbol, number = name[0], "" - value = "{:>2}{:<2}".format(symbol, number) + value = "{:>2.2}{:<2.2}".format(symbol, number) # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] getattr(mi, "Set%s" % rdattr)(value) From cfce2781fd7845096a78a5a442a0294f5d1c982e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 14:31:59 +0200 Subject: [PATCH 32/90] table of supported attributes for the converter --- package/MDAnalysis/coordinates/RDKit.py | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 6810dd8c616..a4b301a498c 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -149,6 +149,49 @@ def __init__(self, filename, **kwargs): class RDKitConverter(base.ConverterBase): """Convert MDAnalysis AtomGroup or Universe to `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` + MDanalysis attributes are stored in each RDKit atom of the resulting + molecule in two different ways: + + * in an `AtomPDBResidueInfo` object available through the + ``atom.GetMonomerInfo()`` method if it's an attribute that is typically + found in a PDB file, + * directly as an atom property available through the + ``atom.GetPropsAsDict()`` method for the others. + + Supported attributes: + + +-----------------------+-------------------------------------------+ + | MDAnalysis attribute | RDKit | + +=======================+===========================================+ + | altLocs | atom.GetMonomerInfo().GetAltLoc() | + +-----------------------+-------------------------------------------+ + | chainIDs | atom.GetMonomerInfo().GetChainId() | + +-----------------------+-------------------------------------------+ + | icodes | atom.GetMonomerInfo().GetInsertionCode() | + +-----------------------+-------------------------------------------+ + | names | atom.GetMonomerInfo().GetName() | + +-----------------------+-------------------------------------------+ + | occupancies | atom.GetMonomerInfo().GetOccupancy() | + +-----------------------+-------------------------------------------+ + | resnames | atom.GetMonomerInfo().GetResidueName() | + +-----------------------+-------------------------------------------+ + | resids | atom.GetMonomerInfo().GetResidueNumber() | + +-----------------------+-------------------------------------------+ + | segindices | atom.GetMonomerInfo().GetSegmentNumber() | + +-----------------------+-------------------------------------------+ + | tempfactors | atom.GetMonomerInfo().GetTempFactor() | + +-----------------------+-------------------------------------------+ + | bfactors | atom.GetDoubleProp("_MDAnalysis_bfactor") | + +-----------------------+-------------------------------------------+ + | charges | atom.GetDoubleProp("_MDAnalysis_charge") | + +-----------------------+-------------------------------------------+ + | indices | atom.GetIntProp("_MDAnalysis_index") | + +-----------------------+-------------------------------------------+ + | segids | atom.GetProp("_MDAnalysis_segid") | + +-----------------------+-------------------------------------------+ + | types | atom.GetProp("_MDAnalysis_type") | + +-----------------------+-------------------------------------------+ + Example ------- From 3717f574110fbf5c6105dd81ce4d16d8a2048e5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 14:32:54 +0200 Subject: [PATCH 33/90] changelog convention --- package/CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 1f2b216028f..42e1a02c562 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -37,7 +37,7 @@ Enhancements * Improved performances when parsing TPR files (PR #2804) * Added converter between Cartesian and Bond-Angle-Torsion coordinates (PR #2668) * Added a simple version of the RDKitConverter that handles uncharged - molecules with bond orders/bond types assigned (#2468, PR #2775) + molecules with bond orders/bond types assigned (Issue #2468, PR #2775) Changes * Changes development status from Beta to Mature (Issue #2773) From 499010a565575318563311b65a651d0a5ae664ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 14:45:16 +0200 Subject: [PATCH 34/90] catch no bonds --- package/MDAnalysis/coordinates/RDKit.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index a4b301a498c..57ecde242b7 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -159,7 +159,7 @@ class RDKitConverter(base.ConverterBase): ``atom.GetPropsAsDict()`` method for the others. Supported attributes: - + +-----------------------+-------------------------------------------+ | MDAnalysis attribute | RDKit | +=======================+===========================================+ @@ -282,19 +282,15 @@ def convert(self, obj): atom_mapper[atom.ix] = index try: - bonds = ag.bonds - if (len(bonds) == 0) and (ag.n_atoms > 1): - # force guessing bonds - raise NoDataError - except NoDataError: + ag.bonds.values() + except (NoDataError, IndexError): warnings.warn( "No `bonds` attribute in this AtomGroup. Guessing bonds based" "on atoms coordinates") ag.guess_bonds() - bonds = ag.bonds # only keep bonds where both atoms belong to the AtomGroup - bonds = bonds.atomgroup_intersection(ag, strict=True) + bonds = ag.bonds.atomgroup_intersection(ag, strict=True) for bond in bonds: bond_indices = [atom_mapper[i] for i in bond.indices] From 9604b6aaa4a118f07cc46e4c24788d5b4834210c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 7 Jul 2020 14:54:16 +0200 Subject: [PATCH 35/90] use match in pytest.raises and pytest.warns --- .../MDAnalysisTests/coordinates/test_rdkit.py | 30 ++++++------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 3e5c66ae640..6cb4e45b8b7 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -20,7 +20,6 @@ # MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 # -import warnings import pytest import MDAnalysis as mda @@ -37,9 +36,9 @@ class TestRequiresRDKit(object): def test_converter_requires_rdkit(self): u = mda.Universe(mol2_molecule) - with pytest.raises(ImportError) as e: + with pytest.raises(ImportError, + match="RDKit is required for the RDKitConverter"): u.atoms.convert_to("RDKIT") - assert "RDKit is required for the RDKitConverter" in str(e.value) try: @@ -83,15 +82,8 @@ def test_coordinates(self, rdmol, n_frames): assert_equal(expected, universe.trajectory.coordinate_array) def test_no_coordinates(self): - with warnings.catch_warnings(record=True) as w: - # Cause all warnings to always be triggered. - warnings.simplefilter("always") - # Trigger a warning. + with pytest.warns(UserWarning, match="No coordinates found"): u = mda.Universe.from_smiles("CCO", generate_coordinates=False) - # Verify the warning - assert len(w) == 1 - assert "No coordinates found" in str( - w[-1].message) expected = np.empty((1, u.atoms.n_atoms, 3), dtype=np.float32) expected[:] = np.nan assert_equal(u.trajectory.coordinate_array, expected) @@ -178,23 +170,19 @@ def test_identical_topology(self): def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) - with pytest.raises(AttributeError) as e: + with pytest.raises( + AttributeError, + match="`elements` attribute is required for the RDKitConverter" + ): u.atoms.convert_to("RDKIT") - assert ("`elements` attribute is required for the RDKitConverter" - in str(e.value)) def test_warn_guess_bonds(self, pdb): pdb.delete_bonds(pdb.bonds) ag = pdb.select_atoms("resnum 101 and segid A") pdb.delete_bonds(ag.bonds) - with warnings.catch_warnings(record=True) as w: - # Cause all warnings to always be triggered. - warnings.simplefilter("always") - # trigger warning + with pytest.warns(UserWarning, + match="No `bonds` attribute in this AtomGroup"): ag.convert_to("RDKIT") - assert len(w) == 1 - assert "No `bonds` attribute in this AtomGroup" in str( - w[-1].message) @pytest.mark.parametrize("attr, value, expected", [ ("names", "C1", " C1 "), From 30871ab027f09fe960550eaabf1dcde0297b1ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 8 Jul 2020 11:31:58 +0200 Subject: [PATCH 36/90] remove bondtype attribute --- package/MDAnalysis/coordinates/RDKit.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 57ecde242b7..344063b9a2a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -74,12 +74,6 @@ except ImportError: pass else: - RDBONDTYPE = { - 'AROMATIC': Chem.BondType.AROMATIC, - 'SINGLE': Chem.BondType.SINGLE, - 'DOUBLE': Chem.BondType.DOUBLE, - 'TRIPLE': Chem.BondType.TRIPLE, - } RDBONDORDER = { 1: Chem.BondType.SINGLE, 1.5: Chem.BondType.AROMATIC, @@ -294,13 +288,7 @@ def convert(self, obj): for bond in bonds: bond_indices = [atom_mapper[i] for i in bond.indices] - try: - bond_type = bond.type.upper() - except AttributeError: - # bond type can be a tuple for PDB files - bond_type = None - bond_type = RDBONDTYPE.get(bond_type, RDBONDORDER.get( - bond.order, Chem.BondType.SINGLE)) + bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) mol.AddBond(*bond_indices, bond_type) # sanitization From 9c0a42e781678156ce4c2cb3b2474cb2c87b6c96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 8 Jul 2020 16:24:52 +0200 Subject: [PATCH 37/90] fix bond attributes --- package/MDAnalysis/coordinates/RDKit.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 344063b9a2a..c5b4747ac97 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -276,10 +276,12 @@ def convert(self, obj): atom_mapper[atom.ix] = index try: - ag.bonds.values() - except (NoDataError, IndexError): + if (len(ag.bonds) == 0) and (ag.n_atoms > 1): + # force guessing bonds + raise NoDataError + except NoDataError: warnings.warn( - "No `bonds` attribute in this AtomGroup. Guessing bonds based" + "No `bonds` attribute in this AtomGroup. Guessing bonds based " "on atoms coordinates") ag.guess_bonds() From 1bbfb0973f32bf5c7a54fc2ce20d30e66e34f183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 8 Jul 2020 19:35:30 +0200 Subject: [PATCH 38/90] simplified the code for infering --- package/MDAnalysis/coordinates/RDKit.py | 101 +++++++----------- .../MDAnalysisTests/coordinates/test_rdkit.py | 5 +- 2 files changed, 42 insertions(+), 64 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 31eebd87b1e..f7b48452e49 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -346,22 +346,19 @@ def _add_mda_attr_to_rdkit(attr, value, mi): def _infer_bo_and_charges(mol, terminal_atom_indices=[]): """Infer bond orders and formal charges from a molecule. - - Step 1 Since most MD topology files don't explicitely retain informations on bond orders or charges, it has to be guessed from the topology. This is done by looping other each atom and comparing its expected valence to the current - valence, called `delta_v`. If two neighbouring atoms have a common - positive delta_v, the bond between them most likely has a bond order of - 1+delta_v. If an atom doesn't share a delta_v with any of its neighbours, - it likely needs a formal charge of -delta_v. - - - Step 2 - Some atoms can be "mutilated" by a selection (i.e. one of their bonds is - cut). The previous step is likely to assign a negative charge to such atoms - even if they weren't charged in the original topology. This step converts - the resulting charges to higher order bonds when possible, or to radical - electrons. This ensures the atomgroup is not artificially charged because - of the previous step. + valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge (-NUE). + If two neighbouring atoms have the same NUE, the bond between them most + likely has to be increased by the value of NUE. + If an atom doesn't share a common NUE with any of its neighbours, it's + either a radical (because one its bonds was cut when creating the + AtomGroup) or it needs a negative formal charge of -NUE. Since these + radical atoms can be detected when looping over the bonds of the AtomGroup, + only atoms that are not part of this "terminal_atoms" list will be assigned + a negative formal charge. Parameters ---------- @@ -372,70 +369,48 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): terminal_atom_indices : list List of terminal atoms indices, i.e. atoms at the edges of a molecule """ - # Step 1 + for atom in mol.GetAtoms(): - # create delta_v for each possible valence + # get NUE for each possible valence expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) current_v = atom.GetTotalValence() - delta_vs = [expected_v - current_v for expected_v in expected_vs] + nue = [expected_v - current_v for expected_v in expected_vs] - # if there's only one possible valence state and the correpsonding - # delta_v is negative, it means we can only add a positive charge to + # if there's only one possible valence state and the corresponding + # NUE is negative, it means we can only add a positive charge to # the atom - if (len(delta_vs) == 1) and (delta_vs[0] < 0): - charge = -delta_vs[0] - atom.SetFormalCharge(charge) + if (len(nue) == 1) and (nue[0] < 0): + atom.SetFormalCharge(-nue[0]) mol.UpdatePropertyCache(strict=False) else: neighbors = atom.GetNeighbors() - # check if one of the neighbors has a common delta_v + # check if one of the neighbors has a common NUE for i, na in enumerate(neighbors, start=1): - # create delta_v for the neighbor + # create NUE for the neighbor na_expected_vs = PERIODIC_TABLE.GetValenceList( na.GetAtomicNum()) na_current = na.GetTotalValence() - na_delta = [ + na_nue = [ na_expected - na_current for na_expected in na_expected_vs] - # smallest common delta_v, else NaN - common_delta = min(set(delta_vs).intersection(na_delta), - default=np.nan) - # common_delta == 0 means we don't need to do anything - if common_delta != 0: - # if they have no delta_v in common - if common_delta is np.nan: - # if it's the last neighbor - if i == len(neighbors): - charge = -delta_vs[0] # negative - atom.SetFormalCharge(charge) - mol.UpdatePropertyCache(strict=False) - # if they both need a supplementary bond + # smallest common NUE, else None + common_nue = min(set(nue).intersection(na_nue), default=None) + # a common NUE of 0 means we don't need to do anything + if common_nue != 0: + # if they have no NUE in common + if common_nue is None: + # # if we've already tried all the neighbors without a solution + # if i == len(neighbors): + # # if it's an edge atom + # if len(neighbors) <= 1 or atom.GetIdx() in terminal_atom_indices: + # # negative charge + # atom.SetFormalCharge(-nue[0]) + # atom.SetNumRadicalElectrons(0) + # mol.UpdatePropertyCache(strict=False) + # break + pass else: bond = mol.GetBondBetweenAtoms( atom.GetIdx(), na.GetIdx()) - bond.SetBondType(RDBONDORDER[common_delta+1]) + bond.SetBondType(RDBONDORDER[common_nue + 1]) mol.UpdatePropertyCache(strict=False) - break # out of neighbors loop - - # Step 2 - for i in terminal_atom_indices: - atom = mol.GetAtomWithIdx(i) - charge = atom.GetFormalCharge() - neighbors = atom.GetNeighbors() - # check if a neighbor atom also bears a charge - for i, na in enumerate(neighbors, 1): - na_charge = na.GetFormalCharge() - if na_charge < 0: - # both atoms have a negative charge - # convert to higher order bond - common_delta = max([charge, na_charge]) - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx()) - bond.SetBondType(RDBONDORDER[-common_delta+1]) - na.SetFormalCharge(na_charge - common_delta) - atom.SetFormalCharge(0) - atom.SetNumRadicalElectrons(common_delta - charge) - break - elif i == len(neighbors): - # no neighbor shares a negative charge - atom.SetNumRadicalElectrons(-atom.GetFormalCharge()) - atom.SetFormalCharge(0) - mol.UpdatePropertyCache(strict=False) + break diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index af287431fc8..160c5508fdd 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -239,12 +239,14 @@ class TestRDKitFunctions(object): ]) def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) + mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) assert mol.HasSubstructMatch( molref) and molref.HasSubstructMatch(mol) + @pytest.mark.skip(reason="not fully working yet") @pytest.mark.parametrize("smi, atom, charge", [ ("C-[O]", "O", -1), ("[N]-[C]-[O]", "O", -1), @@ -252,6 +254,7 @@ def test_infer_bond_orders(self, smi, out): ]) def test_infer_charges(self, smi, atom, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) - _infer_bo_and_charges(mol, [0]) + mol.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(mol) index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] assert mol.GetAtomWithIdx(index).GetFormalCharge() == charge From 4a79cfd6c1f2be9f589679d16e6a5faca991be5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 9 Jul 2020 18:02:49 +0200 Subject: [PATCH 39/90] include negative charges --- package/MDAnalysis/coordinates/RDKit.py | 60 ++++++++++--------- .../MDAnalysisTests/coordinates/test_rdkit.py | 33 ++++++---- 2 files changed, 54 insertions(+), 39 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f7b48452e49..34fc25745c5 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -294,8 +294,8 @@ def convert(self, obj): # can happen for terminal atoms. # save the bond atom that is in the atomgroup for later terminal_atom_indices.extend([atom_mapper[i] - for i in bond.indices - if i in atom_mapper.keys()]) + for i in bond.indices + if i in atom_mapper.keys()]) # skip adding this bond continue bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) @@ -374,7 +374,7 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): # get NUE for each possible valence expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) current_v = atom.GetTotalValence() - nue = [expected_v - current_v for expected_v in expected_vs] + nue = [v - current_v for v in expected_vs] # if there's only one possible valence state and the corresponding # NUE is negative, it means we can only add a positive charge to @@ -382,35 +382,41 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): if (len(nue) == 1) and (nue[0] < 0): atom.SetFormalCharge(-nue[0]) mol.UpdatePropertyCache(strict=False) + continue else: neighbors = atom.GetNeighbors() # check if one of the neighbors has a common NUE for i, na in enumerate(neighbors, start=1): - # create NUE for the neighbor + # get NUE for the neighbor na_expected_vs = PERIODIC_TABLE.GetValenceList( na.GetAtomicNum()) - na_current = na.GetTotalValence() - na_nue = [ - na_expected - na_current for na_expected in na_expected_vs] - # smallest common NUE, else None - common_nue = min(set(nue).intersection(na_nue), default=None) + na_current_v = na.GetTotalValence() + na_nue = [v - na_current_v for v in na_expected_vs] + # smallest common NUE + common_nue = min( + min([i for i in nue if i >= 0], default=0), + min([i for i in na_nue if i >= 0], default=0) + ) # a common NUE of 0 means we don't need to do anything if common_nue != 0: - # if they have no NUE in common - if common_nue is None: - # # if we've already tried all the neighbors without a solution - # if i == len(neighbors): - # # if it's an edge atom - # if len(neighbors) <= 1 or atom.GetIdx() in terminal_atom_indices: - # # negative charge - # atom.SetFormalCharge(-nue[0]) - # atom.SetNumRadicalElectrons(0) - # mol.UpdatePropertyCache(strict=False) - # break - pass - else: - bond = mol.GetBondBetweenAtoms( - atom.GetIdx(), na.GetIdx()) - bond.SetBondType(RDBONDORDER[common_nue + 1]) - mol.UpdatePropertyCache(strict=False) - break + # increase bond order + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + order = common_nue + 1 + bond.SetBondType(RDBONDORDER[order]) + mol.UpdatePropertyCache(strict=False) + if i < len(neighbors): + # recalculate nue for atom + current_v = atom.GetTotalValence() + nue = [v - current_v for v in expected_vs] + + # if the atom still has unpaired electrons + current_v = atom.GetTotalValence() + nue = [v - current_v for v in expected_vs][0] + if nue > 0: + # keep the radical if it's a terminal atom + # else transform it to a negative charge + if atom.GetIdx() not in terminal_atom_indices: + atom.SetFormalCharge(-nue) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 160c5508fdd..218ac8b302b 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -181,7 +181,7 @@ def test_warn_guess_bonds(self, pdb): pdb.delete_bonds(pdb.bonds) ag = pdb.select_atoms("resnum 101 and segid A") pdb.delete_bonds(ag.bonds) - with pytest.warns(UserWarning, + with pytest.warns(UserWarning, match="No `bonds` attribute in this AtomGroup"): ag.convert_to("RDKIT") @@ -228,33 +228,42 @@ def test_index_property(self, pdb, sel_str): @requires_rdkit class TestRDKitFunctions(object): - @pytest.mark.parametrize("smi, out", [ - ("[H]C([H])([H])[H]", "C"), - ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", "c1ccccc1"), - ("[Cl]-[C](-[H])-[O]", "C(=O)Cl"), - ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), - ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), - ("[P](-O)(-O)(-O)-[O]", "P(O)(O)(O)=O"), - ("[N]-[C]-[H]", "N#C"), + @pytest.mark.parametrize("smi, edges, out", [ + ("C(-[H])(-[H])(-[H])-[H]", [], "C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", [], "c1ccccc1"), + ("[Cl]-[C](-[H])-[O]", [], "C(=O)Cl"), + ("[H]-[C](-[O])-[N](-[H])-[H]", [], "C(=O)N"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", [], "C=C"), + ("[P](-O)(-O)(-O)-[O]", [], "P(O)(O)(O)=O"), + ("[P](-[O])(-[O])(-[O])-O", [], "O=P([O-])([O-])O"), + ("[N]-[C]-[H]", [], "N#C"), + ("[C](-[H])(-[H])-[Cl]", [0], "[H][C]([H])Cl"), + ("[C](-[H])-[C](-[H])-[H]", [0], "[H][C]=C([H])[H]"), + ("[C](-[H])-[Cl]", [0], "[H][C]Cl"), + ("[C](-[O])-[Cl]", [0], "O=[C]Cl"), + #("[S](-[O])(-[O])(-[Cl])-[Cl]", [], "O=S(=O)(Cl)Cl"), + #("[S](-[O])(-[O])-[Cl]", [0], "O=[S](=O)Cl"), ]) - def test_infer_bond_orders(self, smi, out): + def test_infer_bond_orders(self, smi, edges, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) + _infer_bo_and_charges(mol, edges) + Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) assert mol.HasSubstructMatch( molref) and molref.HasSubstructMatch(mol) - @pytest.mark.skip(reason="not fully working yet") @pytest.mark.parametrize("smi, atom, charge", [ ("C-[O]", "O", -1), ("[N]-[C]-[O]", "O", -1), ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), + ("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), ]) def test_infer_charges(self, smi, atom, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol) + Chem.SanitizeMol(mol) index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] assert mol.GetAtomWithIdx(index).GetFormalCharge() == charge From bd8890849100622aacacd66f47d97ae1a951a87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 10 Jul 2020 11:37:23 +0200 Subject: [PATCH 40/90] skip failing test for now --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 218ac8b302b..973d976efdd 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -258,7 +258,7 @@ def test_infer_bond_orders(self, smi, edges, out): ("C-[O]", "O", -1), ("[N]-[C]-[O]", "O", -1), ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), - ("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), + #("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), ]) def test_infer_charges(self, smi, atom, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) From d9e088e6a2ea02e79d869cf870edd3ef70d6e9c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 10 Jul 2020 12:40:04 +0200 Subject: [PATCH 41/90] new test --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 973d976efdd..83c752b6efc 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -241,8 +241,9 @@ class TestRDKitFunctions(object): ("[C](-[H])-[C](-[H])-[H]", [0], "[H][C]=C([H])[H]"), ("[C](-[H])-[Cl]", [0], "[H][C]Cl"), ("[C](-[O])-[Cl]", [0], "O=[C]Cl"), - #("[S](-[O])(-[O])(-[Cl])-[Cl]", [], "O=S(=O)(Cl)Cl"), - #("[S](-[O])(-[O])-[Cl]", [0], "O=[S](=O)Cl"), + #("[S](-[O])(-[O])(-[O]-C)-C", [], "COS(=O)(=O)C"), + #("[S](-[O])(-[O])-C", [0], "O=[S](=O)C"), + #("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", [], "CNC(N)=[N+](-[H])-[H]"), ]) def test_infer_bond_orders(self, smi, edges, out): mol = Chem.MolFromSmiles(smi, sanitize=False) From f6aa7361c25e7e87dc0de8877a0785cebe9562f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 10 Jul 2020 13:05:21 +0200 Subject: [PATCH 42/90] update doc --- package/MDAnalysis/coordinates/RDKit.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 34fc25745c5..d03c2606bb3 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -351,14 +351,13 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): looping other each atom and comparing its expected valence to the current valence to get the Number of Unpaired Electrons (NUE). If an atom has a negative NUE, it needs a positive formal charge (-NUE). - If two neighbouring atoms have the same NUE, the bond between them most - likely has to be increased by the value of NUE. - If an atom doesn't share a common NUE with any of its neighbours, it's - either a radical (because one its bonds was cut when creating the - AtomGroup) or it needs a negative formal charge of -NUE. Since these - radical atoms can be detected when looping over the bonds of the AtomGroup, - only atoms that are not part of this "terminal_atoms" list will be assigned - a negative formal charge. + If two neighbouring atoms have UEs, the bond between them most + likely has to be increased by the value of the smallest NUE. + If after this process, an atom still has UEs, it's either a radical + (because one its bonds was cut when creating the AtomGroup) or it needs a + negative formal charge of -NUE. Since these radical atoms can be detected + when looping over the bonds of the AtomGroup, only atoms that are not part + of this "terminal_atoms" list will be assigned a negative formal charge. Parameters ---------- From b943af1c85ca17559cefe71a218069f27828d1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 13 Jul 2020 19:33:24 +0200 Subject: [PATCH 43/90] standardize groups --- package/MDAnalysis/coordinates/RDKit.py | 77 ++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 7 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d03c2606bb3..18cb4fd9dd4 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -71,6 +71,7 @@ try: from rdkit import Chem + from rdkit.Chem import AllChem except ImportError: pass else: @@ -263,13 +264,8 @@ def convert(self, obj): for attr in other_attrs.keys(): value = other_attrs[attr][i] attr = _TOPOLOGY_ATTRS[attr].singular - if isinstance(value, np.float): - rdatom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) - elif isinstance(value, np.int): - rdatom.SetIntProp("_MDAnalysis_%s" % attr, int(value)) - else: - rdatom.SetProp("_MDAnalysis_%s" % attr, value) - rdatom.SetIntProp("_MDAnalysis_index", int(atom.ix)) + _set_atom_property(rdatom, attr, value) + _set_atom_property(rdatom, "index", int(atom.ix)) # add atom index = mol.AddAtom(rdatom) # map index in universe to index in mol @@ -305,6 +301,7 @@ def convert(self, obj): # infer bond orders and formal charges from the connectivity _infer_bo_and_charges(mol, terminal_atom_indices) + mol = _standardize_patterns(mol) # sanitize Chem.SanitizeMol(mol) @@ -343,6 +340,16 @@ def _add_mda_attr_to_rdkit(attr, value, mi): getattr(mi, "Set%s" % rdattr)(value) +def _set_atom_property(atom, attr, value): + """Converts an MDAnalysis atom attribute into an RDKit atom property""" + if isinstance(value, (float, np.float)): + atom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) + elif isinstance(value, (int, np.int)): + atom.SetIntProp("_MDAnalysis_%s" % attr, int(value)) + else: + atom.SetProp("_MDAnalysis_%s" % attr, value) + + def _infer_bo_and_charges(mol, terminal_atom_indices=[]): """Infer bond orders and formal charges from a molecule. @@ -419,3 +426,59 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): atom.SetFormalCharge(-nue) atom.SetNumRadicalElectrons(0) mol.UpdatePropertyCache(strict=False) + + +def _standardize_patterns(mol): + """Standardize functional groups using reactions from SMARTS patterns + + Because the two NH2 groups in arginine are identical, during the reaction the pattern is matched 2 times and produces 2 identical products per ARG residue, which results in as many molecules as there are ARG residues when we use Chem.CombineMols. For this reason, we have to run the ARG reaction and keep the first product as many times as their are arginine residues in the molecule. + The same logic applies to all reactions that imply several identical + moieties (like the two =O in sulfones). + """ + reactions = [ + ("Cterm", "[C-;v3:1]=[O:2]>>[C;+0:1]=[O:2]"), + ("Nterm", "[N-;v2;H1:1]>>[N;+0:1]"), + ("keto-enolate", "[C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3]"), + ] + # arginine + pattern = Chem.MolFromSmarts("[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]") + n = len(mol.GetSubstructMatches(pattern)) + reactions.extend( + n * [("ARG", "[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]" + ">>[N:1]-[C;+0:2](-[N:3])=[N;+1:4]")] + ) + # sulfone + pattern = Chem.MolFromSmarts("[S;v4:1](-[O-;v1:2])-[O-;v1:3]") + n = len(mol.GetSubstructMatches(pattern)) + reactions.extend( + n * [("sulfone", "[S;v4:1](-[O-;v1:2])-[O-;v1:3]" + ">>[S;v6:1](=[O;+0:2])=[O;+0:3]")] + ) + for name, reaction in reactions: + rxn = AllChem.ReactionFromSmarts(reaction) + products = rxn.RunReactants((mol,)) + # product is an empty tuple if there were no matching atoms in the mol + if products: + if name in ["ARG", "sulfone"]: + products = products[::len(products)] + product = products[0][0] + for p in products[1:]: + product = Chem.CombineMols(product, p[0]) + # map back atomic properties to the transformed atoms + for atom in product.GetAtoms(): + try: + atom.GetIntProp("old_mapno") + except KeyError: + pass + else: + atom.ClearProp("old_mapno") + idx = atom.GetUnsignedProp("react_atom_idx") + old_atom = mol.GetAtomWithIdx(idx) + for prop, value in old_atom.GetPropsAsDict().items(): + if prop.startswith("_MDAnalysis"): + attr = prop.split("_")[-1] + _set_atom_property(atom, attr, value) + atom.ClearProp("react_atom_idx") + mol = product + mol.UpdatePropertyCache(strict=False) + return mol From 0105419a50cab38d577497fd85008056912710eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 13 Jul 2020 19:34:59 +0200 Subject: [PATCH 44/90] fix tests --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 83c752b6efc..52b7dca5bf7 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -147,11 +147,13 @@ def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) umol = sel.convert_to("RDKIT") atom = umol.GetAtomWithIdx(atom_index) + mda_index = np.where( + sel.indices == atom.GetIntProp("_MDAnalysis_index")) mi = atom.GetMonomerInfo() for mda_attr, rd_attr in RDATTRIBUTES.items(): rd_value = getattr(mi, "Get%s" % rd_attr)() - mda_value = getattr(sel, "%s" % mda_attr)[atom_index] + mda_value = getattr(sel, "%s" % mda_attr)[mda_index] if mda_attr == "names": rd_value = rd_value.strip() assert rd_value == mda_value @@ -223,6 +225,7 @@ def test_index_property(self, pdb, sel_str): expected = ag.indices indices = np.array([a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()], dtype=np.int32) + indices.sort() assert_equal(indices, expected) From 7168e2d6cd40d69a3f64041a97b87abc72a1c1e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 13 Jul 2020 19:36:12 +0200 Subject: [PATCH 45/90] add previously failing tests --- .../MDAnalysisTests/coordinates/test_rdkit.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 52b7dca5bf7..2b5e3f4bcec 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -24,7 +24,8 @@ import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysis.coordinates.RDKit import _infer_bo_and_charges +from MDAnalysis.coordinates.RDKit import (_infer_bo_and_charges, + _standardize_patterns) import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) @@ -143,7 +144,6 @@ def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): ("resid 34 and altloc B", 2), ]) def test_monomer_info(self, pdb, sel_str, atom_index): - rdmol = Chem.MolFromPDBFile(PDB_full) sel = pdb.select_atoms(sel_str) umol = sel.convert_to("RDKIT") atom = umol.GetAtomWithIdx(atom_index) @@ -244,14 +244,16 @@ class TestRDKitFunctions(object): ("[C](-[H])-[C](-[H])-[H]", [0], "[H][C]=C([H])[H]"), ("[C](-[H])-[Cl]", [0], "[H][C]Cl"), ("[C](-[O])-[Cl]", [0], "O=[C]Cl"), - #("[S](-[O])(-[O])(-[O]-C)-C", [], "COS(=O)(=O)C"), + ("[S](-[O])(-[O])(-[O]-C)-C", [], "COS(=O)(=O)C"), #("[S](-[O])(-[O])-C", [0], "O=[S](=O)C"), - #("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", [], "CNC(N)=[N+](-[H])-[H]"), + ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", + [], "CNC(N)=[N+](-[H])-[H]"), ]) def test_infer_bond_orders(self, smi, edges, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol, edges) + mol = _standardize_patterns(mol) Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) @@ -262,12 +264,19 @@ def test_infer_bond_orders(self, smi, edges, out): ("C-[O]", "O", -1), ("[N]-[C]-[O]", "O", -1), ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), - #("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), + ("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), ]) def test_infer_charges(self, smi, atom, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol) Chem.SanitizeMol(mol) index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] assert mol.GetAtomWithIdx(index).GetFormalCharge() == charge + + def test_standardize_patterns(self): + pass + + def test_set_atom_property(self): + pass From d1f06f3f6d65f14e18e66b1b215c18d9f272d929 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 14 Jul 2020 22:50:52 +0200 Subject: [PATCH 46/90] simpler pattern standardization --- package/MDAnalysis/coordinates/RDKit.py | 117 ++++++++++++++---------- 1 file changed, 69 insertions(+), 48 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 18cb4fd9dd4..af1c063b556 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -431,54 +431,75 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): def _standardize_patterns(mol): """Standardize functional groups using reactions from SMARTS patterns - Because the two NH2 groups in arginine are identical, during the reaction the pattern is matched 2 times and produces 2 identical products per ARG residue, which results in as many molecules as there are ARG residues when we use Chem.CombineMols. For this reason, we have to run the ARG reaction and keep the first product as many times as their are arginine residues in the molecule. - The same logic applies to all reactions that imply several identical - moieties (like the two =O in sulfones). + Due to the way reactions work, we first have to split the molecule by + fragments. Then, for each fragment, we apply the standardization reactions. + If a pattern is matched N times in the molecule, the reaction will return N + products as an array of shape (N, 1). Only the first product will be kept + and the same reaction will be reapplied to the product N times in total. + Finally, the fragments are recombined. """ - reactions = [ - ("Cterm", "[C-;v3:1]=[O:2]>>[C;+0:1]=[O:2]"), - ("Nterm", "[N-;v2;H1:1]>>[N;+0:1]"), - ("keto-enolate", "[C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3]"), - ] - # arginine - pattern = Chem.MolFromSmarts("[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]") - n = len(mol.GetSubstructMatches(pattern)) - reactions.extend( - n * [("ARG", "[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]" - ">>[N:1]-[C;+0:2](-[N:3])=[N;+1:4]")] - ) - # sulfone - pattern = Chem.MolFromSmarts("[S;v4:1](-[O-;v1:2])-[O-;v1:3]") - n = len(mol.GetSubstructMatches(pattern)) - reactions.extend( - n * [("sulfone", "[S;v4:1](-[O-;v1:2])-[O-;v1:3]" - ">>[S;v6:1](=[O;+0:2])=[O;+0:3]")] - ) - for name, reaction in reactions: - rxn = AllChem.ReactionFromSmarts(reaction) - products = rxn.RunReactants((mol,)) - # product is an empty tuple if there were no matching atoms in the mol - if products: - if name in ["ARG", "sulfone"]: - products = products[::len(products)] - product = products[0][0] - for p in products[1:]: - product = Chem.CombineMols(product, p[0]) - # map back atomic properties to the transformed atoms - for atom in product.GetAtoms(): - try: - atom.GetIntProp("old_mapno") - except KeyError: - pass + + fragments = [] + for reactant in Chem.GetMolFrags(mol, asMols=True): + + for name, reaction in [ + ("Cterm", "[C-;v3:1]=[O:2]>>[C;+0:1]=[O:2]"), + ("Nterm", "[N-;v2;H1:1]>>[N;+0:1]"), + ("keto-enolate", "[C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3]"), + ("ARG", "[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]" + ">>[N:1]-[C;+0:2](-[N:3])=[N;+1:4]"), + ("sulfone", "[S;v4:1](-[O-;v1:2])-[O-;v1:3]" + ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), + ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" + ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), + ]: + # count how many times the reaction should be run + pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) + n_matches = len(reactant.GetSubstructMatches(pattern)) + + # run the reaction for each matched pattern + rxn = AllChem.ReactionFromSmarts(reaction) + for n in range(n_matches): + products = rxn.RunReactants((reactant,)) + # only keep the first product + if products: + product = products[0][0] + product.UpdatePropertyCache(strict=False) + # make sure each atom in the product has its atom properties + _reassign_props_after_reaction(reactant, product) + # apply the next reaction to the product + reactant = product else: - atom.ClearProp("old_mapno") - idx = atom.GetUnsignedProp("react_atom_idx") - old_atom = mol.GetAtomWithIdx(idx) - for prop, value in old_atom.GetPropsAsDict().items(): - if prop.startswith("_MDAnalysis"): - attr = prop.split("_")[-1] - _set_atom_property(atom, attr, value) - atom.ClearProp("react_atom_idx") - mol = product - mol.UpdatePropertyCache(strict=False) + # exit the n_matches loop if there's no product. Example + # where this is needed: SO^{4}_{2-} will match the sulfone + # pattern 6 times but the reaction is only needed once + break + + fragments.append(reactant) + + # recombine fragments + mol = fragments.pop(0) + for fragment in fragments: + mol = Chem.CombineMols(mol, fragment) + return mol + + +def _reassign_props_after_reaction(reactant, product): + """Maps back atomic properties from the reactant to the product. + The product molecule is modified inplace. + """ + for atom in product.GetAtoms(): + try: + atom.GetIntProp("old_mapno") + except KeyError: + pass + else: + atom.ClearProp("old_mapno") + idx = atom.GetUnsignedProp("react_atom_idx") + old_atom = reactant.GetAtomWithIdx(idx) + for prop, value in old_atom.GetPropsAsDict().items(): + if prop.startswith("_MDAnalysis"): + attr = prop.split("_")[-1] + _set_atom_property(atom, attr, value) + atom.ClearProp("react_atom_idx") From 5a661f8a7dd4658a094cd9e3303f3d7e8f25d3ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 14 Jul 2020 22:51:30 +0200 Subject: [PATCH 47/90] more tests on inferring and patterns --- .../MDAnalysisTests/coordinates/test_rdkit.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 2b5e3f4bcec..8d3cc24fe3d 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -233,50 +233,71 @@ def test_index_property(self, pdb, sel_str): class TestRDKitFunctions(object): @pytest.mark.parametrize("smi, edges, out", [ ("C(-[H])(-[H])(-[H])-[H]", [], "C"), - ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", [], "c1ccccc1"), - ("[Cl]-[C](-[H])-[O]", [], "C(=O)Cl"), - ("[H]-[C](-[O])-[N](-[H])-[H]", [], "C(=O)N"), ("[C](-[H])(-[H])-[C](-[H])-[H]", [], "C=C"), - ("[P](-O)(-O)(-O)-[O]", [], "P(O)(O)(O)=O"), - ("[P](-[O])(-[O])(-[O])-O", [], "O=P([O-])([O-])O"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", [], + "c1ccccc1"), + ("C-[C](-[H])-[O]", [], "C(=O)C"), + ("[H]-[C](-[O])-[N](-[H])-[H]", [], "C(=O)N"), ("[N]-[C]-[H]", [], "N#C"), + ("C-[C](-[O]-[H])-[O]", [], "CC(=O)O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", [], "P(O)(O)(O)=O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", [], "P([O-])(O)(O)=O"), + ("[P](-[O]-[H])(-[O])(-[O])-[O]", [], "P([O-])([O-])(O)=O"), + ("[P](-[O])(-[O])(-[O])-[O]", [], "P([O-])([O-])([O-])=O"), + ("[H]-[O]-[N]-[O]", [], "ON=O"), + ("[N]-[C]-[O]", [], "N#C[O-]"), ("[C](-[H])(-[H])-[Cl]", [0], "[H][C]([H])Cl"), ("[C](-[H])-[C](-[H])-[H]", [0], "[H][C]=C([H])[H]"), ("[C](-[H])-[Cl]", [0], "[H][C]Cl"), ("[C](-[O])-[Cl]", [0], "O=[C]Cl"), - ("[S](-[O])(-[O])(-[O]-C)-C", [], "COS(=O)(=O)C"), - #("[S](-[O])(-[O])-C", [0], "O=[S](=O)C"), - ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", - [], "CNC(N)=[N+](-[H])-[H]"), ]) def test_infer_bond_orders(self, smi, edges, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol, edges) - mol = _standardize_patterns(mol) Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert mol.HasSubstructMatch( - molref) and molref.HasSubstructMatch(mol) - - @pytest.mark.parametrize("smi, atom, charge", [ - ("C-[O]", "O", -1), - ("[N]-[C]-[O]", "O", -1), - ("[N](-[H])(-[H])(-[H])-[H]", "N", 1), - ("[O]-[C](-[H])-[C](-[H])-[H]", "O", -1), + assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( + mol), "{} != {}".format(Chem.MolToSmiles(mol), out) + + @pytest.mark.parametrize("smi, atom_idx, charge", [ + ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), + ("[N]-[C]-[O]", 2, -1), + ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), + ("C-[C](-[O])-[O]", 3, -1), ]) - def test_infer_charges(self, smi, atom, charge): + def test_infer_charges(self, smi, atom_idx, charge): + mol = Chem.MolFromSmiles(smi, sanitize=False) + mol.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(mol) + Chem.SanitizeMol(mol) + assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge + + @pytest.mark.parametrize("smi, out", [ + ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), + ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), + ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), + ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", + "CNC(N)=[N+](-[H])-[H]"), + ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), + ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), + ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), + ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), + ]) + def test_standardize_patterns(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) _infer_bo_and_charges(mol) mol = _standardize_patterns(mol) Chem.SanitizeMol(mol) - index = mol.GetSubstructMatch(Chem.MolFromSmarts(atom))[0] - assert mol.GetAtomWithIdx(index).GetFormalCharge() == charge + mol = Chem.RemoveHs(mol) + molref = Chem.MolFromSmiles(out) + assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( + mol), "{} != {}".format(Chem.MolToSmiles(mol), out) - def test_standardize_patterns(self): + def test_set_atom_property(self): pass - def test_set_atom_property(self): + def test_reassign_props_after_reaction(self): pass From 6e9c221961f1c6912ef5e34521224edec91abafd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 14 Jul 2020 23:56:21 +0200 Subject: [PATCH 48/90] test atom properties and reaction mapping --- .../MDAnalysisTests/coordinates/test_rdkit.py | 78 +++++++++++++++++-- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 8d3cc24fe3d..1ca35a0d90f 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -25,7 +25,9 @@ import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element from MDAnalysis.coordinates.RDKit import (_infer_bo_and_charges, - _standardize_patterns) + _standardize_patterns, + _set_atom_property, + _reassign_props_after_reaction) import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) @@ -296,8 +298,74 @@ def test_standardize_patterns(self, smi, out): assert mol.HasSubstructMatch(molref) and molref.HasSubstructMatch( mol), "{} != {}".format(Chem.MolToSmiles(mol), out) - def test_set_atom_property(self): - pass + @pytest.mark.parametrize("attr, value, getter", [ + ("index", 42, "GetIntProp"), + ("index", np.int(42), "GetIntProp"), + ("charge", 4.2, "GetDoubleProp"), + ("charge", np.float(4.2), "GetDoubleProp"), + ("type", "C.3", "GetProp"), + ]) + def test_set_atom_property(self, attr, value, getter): + atom = Chem.Atom(1) + _set_atom_property(atom, attr, value) + assert getattr(atom, getter)("_MDAnalysis_%s" % attr) == value + + def dummy_product(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetIntProp("old_mapno", 0) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol - def test_reassign_props_after_reaction(self): - pass + def dummy_product_nomap(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol + + def dummy_reactant_noprops(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + mol.AddAtom(atom) + return mol + + def dummy_reactant(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetProp("foo", "bar") + atom.SetIntProp("_MDAnalysis_index", 1) + atom.SetDoubleProp("_MDAnalysis_charge", 4.2) + atom.SetProp("_MDAnalysis_type", "C.3") + mol.AddAtom(atom) + return mol + + @pytest.mark.parametrize("reactant, product, name", [ + (dummy_reactant(), dummy_product(), "props"), + (dummy_reactant_noprops(), dummy_product(), "noprops"), + (dummy_reactant(), dummy_product_nomap(), "nomap"), + ]) + def test_reassign_props_after_reaction(self, reactant, product, name): + _reassign_props_after_reaction(reactant, product) + atom = product.GetAtomWithIdx(0) + if name == "props": + with pytest.raises(KeyError, match="foo"): + atom.GetProp("foo") + assert atom.GetIntProp("_MDAnalysis_index") == 1 + assert atom.GetDoubleProp("_MDAnalysis_charge") == 4.2 + assert atom.GetProp("_MDAnalysis_type") == "C.3" + with pytest.raises(KeyError, match="old_mapno"): + atom.GetIntProp("old_mapno") + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + elif name == "noprops": + with pytest.raises(KeyError, match="old_mapno"): + atom.GetIntProp("old_mapno") + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + elif name == "nomap": + with pytest.raises(KeyError, match="react_atom_idx"): + atom.GetUnsignedProp("react_atom_idx") + with pytest.raises(KeyError, match="_MDAnalysis_index"): + atom.GetIntProp("_MDAnalysis_index") From b02fef62ccea9ff8c65eb3bf9712f8543fe8e43c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 15 Jul 2020 01:09:48 +0200 Subject: [PATCH 49/90] fix docs --- package/MDAnalysis/coordinates/RDKit.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index af1c063b556..418feb79f1b 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -142,7 +142,7 @@ def __init__(self, filename, **kwargs): class RDKitConverter(base.ConverterBase): - """Convert MDAnalysis AtomGroup or Universe to `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` + """Convert MDAnalysis :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` to `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` MDanalysis attributes are stored in each RDKit atom of the resulting molecule in two different ways: @@ -198,6 +198,15 @@ class RDKitConverter(base.ConverterBase): mol = u.select_atoms('resname DMS').convert_to('RDKIT') + Notes + ----- + + The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` attribute + to be present in the topology, else it will fail. + It also requires the `bonds` attribute, although they will be automatically + guessed if not present. + + .. versionadded:: 2.0.0 .. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol @@ -206,12 +215,15 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} - def convert(self, obj): + def convert(self, obj, NoImplicit=True): """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. Parameters ----------- obj : AtomGroup or Universe + + NoImplicit : bool + Prevent adding hydrogens to the molecule """ try: from rdkit import Chem @@ -241,20 +253,21 @@ def convert(self, obj): for attr in RDATTRIBUTES.keys(): if hasattr(ag, attr): pdb_attrs[attr] = getattr(ag, attr) - # others + other_attrs = {} for attr in ["bfactors", "charges", "segids", "types"]: if hasattr(ag, attr): other_attrs[attr] = getattr(ag, attr) mol = Chem.RWMol() + # map index in universe to index in mol atom_mapper = {} for i, (atom, element) in enumerate(zip(ag, elements)): # create atom rdatom = Chem.Atom(element.capitalize()) # disable adding H to the molecule - rdatom.SetNoImplicit(True) + rdatom.SetNoImplicit(NoImplicit) # add PDB-like properties mi = Chem.AtomPDBResidueInfo() for attr, values in pdb_attrs.items(): @@ -268,7 +281,6 @@ def convert(self, obj): _set_atom_property(rdatom, "index", int(atom.ix)) # add atom index = mol.AddAtom(rdatom) - # map index in universe to index in mol atom_mapper[atom.ix] = index try: From c8acda0f4875f26deca6c70d0aaf0e1b25d22682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 15 Jul 2020 11:59:04 +0200 Subject: [PATCH 50/90] fix minimal build tests --- .../MDAnalysisTests/coordinates/test_rdkit.py | 86 +++++++++++-------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 1ca35a0d90f..b65023ec2bc 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -24,10 +24,6 @@ import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysis.coordinates.RDKit import (_infer_bo_and_charges, - _standardize_patterns, - _set_atom_property, - _reassign_props_after_reaction) import numpy as np from numpy.testing import (assert_equal, assert_almost_equal) @@ -49,13 +45,31 @@ def test_converter_requires_rdkit(self): from rdkit import Chem from rdkit.Chem import AllChem from MDAnalysis.coordinates.RDKit import ( - RDATTRIBUTES, _add_mda_attr_to_rdkit) + RDATTRIBUTES, + _add_mda_attr_to_rdkit, + _infer_bo_and_charges, + _standardize_patterns, + _set_atom_property, + _reassign_props_after_reaction + ) except ImportError: def mol2_mol(): pass def smiles_mol(): pass + + def dummy_product(): + pass + + def dummy_product_nomap(): + pass + + def dummy_reactant(): + pass + + def dummy_reactant_noprops(): + pass else: def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) @@ -65,6 +79,37 @@ def smiles_mol(): mol = Chem.AddHs(mol) cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol + + def dummy_product(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetIntProp("old_mapno", 0) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol + + def dummy_product_nomap(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetUnsignedProp("react_atom_idx", 0) + mol.AddAtom(atom) + return mol + + def dummy_reactant_noprops(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + mol.AddAtom(atom) + return mol + + def dummy_reactant(): + mol = Chem.RWMol() + atom = Chem.Atom(1) + atom.SetProp("foo", "bar") + atom.SetIntProp("_MDAnalysis_index", 1) + atom.SetDoubleProp("_MDAnalysis_charge", 4.2) + atom.SetProp("_MDAnalysis_type", "C.3") + mol.AddAtom(atom) + return mol requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), @@ -310,37 +355,6 @@ def test_set_atom_property(self, attr, value, getter): _set_atom_property(atom, attr, value) assert getattr(atom, getter)("_MDAnalysis_%s" % attr) == value - def dummy_product(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - atom.SetIntProp("old_mapno", 0) - atom.SetUnsignedProp("react_atom_idx", 0) - mol.AddAtom(atom) - return mol - - def dummy_product_nomap(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - atom.SetUnsignedProp("react_atom_idx", 0) - mol.AddAtom(atom) - return mol - - def dummy_reactant_noprops(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - mol.AddAtom(atom) - return mol - - def dummy_reactant(): - mol = Chem.RWMol() - atom = Chem.Atom(1) - atom.SetProp("foo", "bar") - atom.SetIntProp("_MDAnalysis_index", 1) - atom.SetDoubleProp("_MDAnalysis_charge", 4.2) - atom.SetProp("_MDAnalysis_type", "C.3") - mol.AddAtom(atom) - return mol - @pytest.mark.parametrize("reactant, product, name", [ (dummy_reactant(), dummy_product(), "props"), (dummy_reactant_noprops(), dummy_product(), "noprops"), From 9defa40f1dde08f7c08c61ca91e8561b4a26b7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 15 Jul 2020 12:06:13 +0200 Subject: [PATCH 51/90] pep8 --- package/MDAnalysis/coordinates/RDKit.py | 8 ++++---- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 418feb79f1b..05793a0b8c5 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -202,7 +202,7 @@ class RDKitConverter(base.ConverterBase): ----- The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` attribute - to be present in the topology, else it will fail. + to be present in the topology, else it will fail. It also requires the `bonds` attribute, although they will be automatically guessed if not present. @@ -447,7 +447,7 @@ def _standardize_patterns(mol): fragments. Then, for each fragment, we apply the standardization reactions. If a pattern is matched N times in the molecule, the reaction will return N products as an array of shape (N, 1). Only the first product will be kept - and the same reaction will be reapplied to the product N times in total. + and the same reaction will be reapplied to the product N times in total. Finally, the fragments are recombined. """ @@ -477,7 +477,7 @@ def _standardize_patterns(mol): if products: product = products[0][0] product.UpdatePropertyCache(strict=False) - # make sure each atom in the product has its atom properties + # map back atom properties from the reactant to the product _reassign_props_after_reaction(reactant, product) # apply the next reaction to the product reactant = product @@ -498,7 +498,7 @@ def _standardize_patterns(mol): def _reassign_props_after_reaction(reactant, product): - """Maps back atomic properties from the reactant to the product. + """Maps back atomic properties from the reactant to the product. The product molecule is modified inplace. """ for atom in product.GetAtoms(): diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index b65023ec2bc..3c535e70e8d 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -45,13 +45,13 @@ def test_converter_requires_rdkit(self): from rdkit import Chem from rdkit.Chem import AllChem from MDAnalysis.coordinates.RDKit import ( - RDATTRIBUTES, + RDATTRIBUTES, _add_mda_attr_to_rdkit, _infer_bo_and_charges, _standardize_patterns, _set_atom_property, - _reassign_props_after_reaction - ) + _reassign_props_after_reaction, + ) except ImportError: def mol2_mol(): pass @@ -79,7 +79,7 @@ def smiles_mol(): mol = Chem.AddHs(mol) cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol - + def dummy_product(): mol = Chem.RWMol() atom = Chem.Atom(1) From 39ae42bd6b461966e72dc4d1a7b67ee01c2ca962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 15 Jul 2020 17:13:53 +0200 Subject: [PATCH 52/90] bfactors as TempFactor --- package/MDAnalysis/coordinates/RDKit.py | 13 ++++++++++-- .../MDAnalysisTests/coordinates/test_rdkit.py | 21 +++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 05793a0b8c5..5b0a1c1dcc9 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -94,6 +94,7 @@ "resids": "ResidueNumber", "segindices": "SegmentNumber", "tempfactors": "TempFactor", + "bfactors": "TempFactor", } PERIODIC_TABLE = Chem.GetPeriodicTable() @@ -176,7 +177,7 @@ class RDKitConverter(base.ConverterBase): +-----------------------+-------------------------------------------+ | tempfactors | atom.GetMonomerInfo().GetTempFactor() | +-----------------------+-------------------------------------------+ - | bfactors | atom.GetDoubleProp("_MDAnalysis_bfactor") | + | bfactors | atom.GetMonomerInfo().GetTempFactor() | +-----------------------+-------------------------------------------+ | charges | atom.GetDoubleProp("_MDAnalysis_charge") | +-----------------------+-------------------------------------------+ @@ -205,6 +206,8 @@ class RDKitConverter(base.ConverterBase): to be present in the topology, else it will fail. It also requires the `bonds` attribute, although they will be automatically guessed if not present. + If both `tempfactors` and `bfactors` attributes are present, the conversion + will fail, since only one of these should be present. .. versionadded:: 2.0.0 @@ -250,12 +253,18 @@ def convert(self, obj, NoImplicit=True): # attributes accepted in PDBResidueInfo object pdb_attrs = {} + if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): + raise AttributeError( + "Both `tempfactors` and `bfactors` attributes are present but " + "only one can be assigned to the RDKit molecule. Please " + "delete the unnecessary one and retry." + ) for attr in RDATTRIBUTES.keys(): if hasattr(ag, attr): pdb_attrs[attr] = getattr(ag, attr) other_attrs = {} - for attr in ["bfactors", "charges", "segids", "types"]: + for attr in ["charges", "segids", "types"]: if hasattr(ag, attr): other_attrs[attr] = getattr(ag, attr) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 3c535e70e8d..969b9890cb3 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -200,10 +200,11 @@ def test_monomer_info(self, pdb, sel_str, atom_index): for mda_attr, rd_attr in RDATTRIBUTES.items(): rd_value = getattr(mi, "Get%s" % rd_attr)() - mda_value = getattr(sel, "%s" % mda_attr)[mda_index] - if mda_attr == "names": - rd_value = rd_value.strip() - assert rd_value == mda_value + if hasattr(sel, mda_attr): + mda_value = getattr(sel, mda_attr)[mda_index] + if mda_attr == "names": + rd_value = rd_value.strip() + assert rd_value == mda_value def test_identical_topology_mol2(self, mol2): """Check stereochemistry on atoms and bonds (but not yet)""" @@ -246,6 +247,7 @@ def test_warn_guess_bonds(self, pdb): ("resids", 123, 123), ("segindices", 1, 1), ("tempfactors", 0.8, 0.8), + ("bfactors", 0.8, 0.8), ]) def test_add_mda_attr_to_rdkit(self, attr, value, expected): mi = Chem.AtomPDBResidueInfo() @@ -253,6 +255,17 @@ def test_add_mda_attr_to_rdkit(self, attr, value, expected): rdvalue = getattr(mi, "Get%s" % RDATTRIBUTES[attr])() assert rdvalue == expected + def test_bfactors_tempfactors_raises_error(self): + u = mda.Universe.from_smiles("C") + bfactors = np.array(u.atoms.n_atoms*[1.0], dtype=np.float32) + u.add_TopologyAttr('bfactors', bfactors) + u.add_TopologyAttr('tempfactors', bfactors) + with pytest.raises( + AttributeError, + match="Both `tempfactors` and `bfactors` attributes are present" + ): + u.atoms.convert_to("RDKIT") + @pytest.mark.parametrize("idx", [0, 10, 42]) def test_other_attributes(self, mol2, idx): mol = mol2.atoms.convert_to("RDKIT") From ff45d3b0360c8932bf5d59e9bb0cdcf1dcbceb43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 16 Jul 2020 00:37:11 +0200 Subject: [PATCH 53/90] Update CHANGELOG --- package/CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index c92fcfe9cec..547788c1396 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -47,8 +47,8 @@ Enhancements * Added converter between Cartesian and Bond-Angle-Torsion coordinates (PR #2668) * Added Hydrogen Bond Lifetime via existing autocorrelation features (PR #2791) * Added Hydrogen Bond Lifetime keyword "between" (PR #2791) - * Added a simple version of the RDKitConverter that handles uncharged - molecules with bond orders/bond types assigned (Issue #2468, PR #2775) + * Added a converter that works for any input with all hydrogens explicit in + the topology (Issue #2468, PR #2775) Changes * deprecated NumPy type aliases have been replaced with their actual types From 7d7f3f2c0003500ea4fdaf653724b7f2678ae193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 16 Jul 2020 18:32:56 +0200 Subject: [PATCH 54/90] pep8 --- package/MDAnalysis/coordinates/RDKit.py | 27 ++++++++++++++----------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 5b0a1c1dcc9..89d7d53fb3e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -24,9 +24,9 @@ """RDKit molecule I/O --- :mod:`MDAnalysis.coordinates.RDKit` ================================================================ -Read coordinates data from an `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` -into an MDAnalysis Universe. Convert it back to an :class:`rdkit.Chem.rdchem.Mol` with -:class:`RDKitConverter`. +Read coordinates data from an `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` with +:class:`RDKitReader` into an MDAnalysis Universe. Convert it back to an +:class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. Example @@ -143,15 +143,17 @@ def __init__(self, filename, **kwargs): class RDKitConverter(base.ConverterBase): - """Convert MDAnalysis :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` to `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` + """Convert MDAnalysis :class:`~MDAnalysis.core.groups.AtomGroup` or + :class:`~MDAnalysis.core.universe.Universe` to `RDKit`_ + :class:`rdkit.Chem.rdchem.Mol` - MDanalysis attributes are stored in each RDKit atom of the resulting - molecule in two different ways: + MDanalysis attributes are stored in each RDKit atom of the resulting + molecule in two different ways: - * in an `AtomPDBResidueInfo` object available through the - ``atom.GetMonomerInfo()`` method if it's an attribute that is typically + * in an `AtomPDBResidueInfo` object available through the + ``atom.GetMonomerInfo()`` method if it's an attribute that is typically found in a PDB file, - * directly as an atom property available through the + * directly as an atom property available through the ``atom.GetPropsAsDict()`` method for the others. Supported attributes: @@ -202,8 +204,8 @@ class RDKitConverter(base.ConverterBase): Notes ----- - The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` attribute - to be present in the topology, else it will fail. + The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` + attribute to be present in the topology, else it will fail. It also requires the `bonds` attribute, although they will be automatically guessed if not present. If both `tempfactors` and `bfactors` attributes are present, the conversion @@ -219,7 +221,8 @@ class RDKitConverter(base.ConverterBase): units = {'time': None, 'length': 'Angstrom'} def convert(self, obj, NoImplicit=True): - """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. + """Write selection at current trajectory frame to + :class:`rdkit.Chem.rdchem.Mol`. Parameters ----------- From f3f6f01773bf279d7bb4a42ccade3b04306ab5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 16 Jul 2020 18:41:45 +0200 Subject: [PATCH 55/90] pep8 --- package/MDAnalysis/coordinates/RDKit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 89d7d53fb3e..ff71792332e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -221,7 +221,7 @@ class RDKitConverter(base.ConverterBase): units = {'time': None, 'length': 'Angstrom'} def convert(self, obj, NoImplicit=True): - """Write selection at current trajectory frame to + """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. Parameters From 5d342dbe039a5065a756de858bad6f6ba683caa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 16 Jul 2020 19:19:51 +0200 Subject: [PATCH 56/90] no hydrogen warning --- package/MDAnalysis/coordinates/RDKit.py | 9 +++++++++ testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index ff71792332e..2cd233c6d80 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -254,6 +254,15 @@ def convert(self, obj, NoImplicit=True): "documentation to guess elements from other attributes or " "type `help(mda.topology.guessers)`") from None + if "H" not in ag.elements: + warnings.warn( + "No hydrogen atom could be found in the topology, but the " + "converter requires all hydrogens to be explicit. Please " + "check carefully the output molecule as the converter is " + "likely to add negative charges and assign incorrect bond " + "orders to structures with implicit hydrogens." + ) + # attributes accepted in PDBResidueInfo object pdb_attrs = {} if hasattr(ag, "bfactors") and hasattr(ag, "tempfactors"): diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 969b9890cb3..02c5e769971 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -235,6 +235,14 @@ def test_warn_guess_bonds(self, pdb): match="No `bonds` attribute in this AtomGroup"): ag.convert_to("RDKIT") + def test_warn_no_hydrogen(self): + u = mda.Universe.from_smiles("O=O") + with pytest.warns( + UserWarning, + match="No hydrogen atom could be found in the topology" + ): + u.atoms.convert_to("RDKIT") + @pytest.mark.parametrize("attr, value, expected", [ ("names", "C1", " C1 "), ("names", "C12", " C12"), From eff29a2ca43a5d79348706a680b1d8d0188d08ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 17 Jul 2020 19:03:19 +0200 Subject: [PATCH 57/90] quickfix for polycyclic conjugated systems --- package/MDAnalysis/coordinates/RDKit.py | 27 ++++++++------ .../MDAnalysisTests/coordinates/test_rdkit.py | 35 +++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 2cd233c6d80..3a4ec1143f3 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -449,16 +449,16 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): current_v = atom.GetTotalValence() nue = [v - current_v for v in expected_vs] - # if the atom still has unpaired electrons - current_v = atom.GetTotalValence() - nue = [v - current_v for v in expected_vs][0] - if nue > 0: - # keep the radical if it's a terminal atom - # else transform it to a negative charge - if atom.GetIdx() not in terminal_atom_indices: - atom.SetFormalCharge(-nue) - atom.SetNumRadicalElectrons(0) - mol.UpdatePropertyCache(strict=False) + # if the atom still has unpaired electrons + current_v = atom.GetTotalValence() + nue = [v - current_v for v in expected_vs][0] + if nue > 0: + # keep the radical if it's a terminal atom + # else transform it to a negative charge + if atom.GetIdx() not in terminal_atom_indices: + atom.SetFormalCharge(-nue) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) def _standardize_patterns(mol): @@ -485,6 +485,13 @@ def _standardize_patterns(mol): ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), + ("anion-*=*-anion", "[*-:1]-[*:2]=[*:3]-[*-:4]" + ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), + ("anion-*=*-*=*-anion", "[*-:1][*:2]=[*:3][*:4]=[*:5][*-:6]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), + ("anion-*=*-*=*-*=*-anion", + "[*-:1][*:2]=[*:3][*:4]=[*:5][*:6]=[*:7][*-:8]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]-[*:7]=[*;+0:8]"), ]: # count how many times the reaction should be run pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 02c5e769971..d7ea5e3833a 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -404,3 +404,38 @@ def test_reassign_props_after_reaction(self, reactant, product, name): atom.GetUnsignedProp("react_atom_idx") with pytest.raises(KeyError, match="_MDAnalysis_index"): atom.GetIntProp("_MDAnalysis_index") + + @pytest.mark.parametrize("smi_in", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", + #"c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + ]) + def test_order_independant(self, smi_in): + # generate mol with hydrogens but without bond orders + ref = Chem.MolFromSmiles(smi_in) + template = Chem.AddHs(ref) + for atom in template.GetAtoms(): + atom.SetIsAromatic(False) + for bond in template.GetBonds(): + bond.SetIsAromatic(False) + bond.SetBondType(Chem.BondType.SINGLE) + Chem.SanitizeMol(template) + # go through each possible starting atom + for a in template.GetAtoms(): + smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) + m = Chem.MolFromSmiles(smi, sanitize=False) + for atom in m.GetAtoms(): + atom.SetNoImplicit(True) + m.UpdatePropertyCache(strict=False) + _infer_bo_and_charges(m) + m = _standardize_patterns(m) + Chem.SanitizeMol(m) + m = Chem.RemoveHs(m) + assert m.HasSubstructMatch(ref) and ref.HasSubstructMatch( + m), "Failed when starting from atom %s%d" % ( + a.GetSymbol(), a.GetIndex()) From ca077969babba91d6c37503fe505a7e5462df028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 20 Jul 2020 11:31:32 +0200 Subject: [PATCH 58/90] save any atom property instead of just the ones tagged with "_MDAnalysis" --- package/MDAnalysis/coordinates/RDKit.py | 16 +++++++--------- .../MDAnalysisTests/coordinates/test_rdkit.py | 8 ++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3a4ec1143f3..4b1daad35be 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -297,9 +297,9 @@ def convert(self, obj, NoImplicit=True): # other properties for attr in other_attrs.keys(): value = other_attrs[attr][i] - attr = _TOPOLOGY_ATTRS[attr].singular + attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular _set_atom_property(rdatom, attr, value) - _set_atom_property(rdatom, "index", int(atom.ix)) + _set_atom_property(rdatom, "_MDAnalysis_index", int(atom.ix)) # add atom index = mol.AddAtom(rdatom) atom_mapper[atom.ix] = index @@ -374,13 +374,13 @@ def _add_mda_attr_to_rdkit(attr, value, mi): def _set_atom_property(atom, attr, value): - """Converts an MDAnalysis atom attribute into an RDKit atom property""" + """Saves any attribute and value into an RDKit atom property""" if isinstance(value, (float, np.float)): - atom.SetDoubleProp("_MDAnalysis_%s" % attr, float(value)) + atom.SetDoubleProp(attr, float(value)) elif isinstance(value, (int, np.int)): - atom.SetIntProp("_MDAnalysis_%s" % attr, int(value)) + atom.SetIntProp(attr, int(value)) else: - atom.SetProp("_MDAnalysis_%s" % attr, value) + atom.SetProp(attr, value) def _infer_bo_and_charges(mol, terminal_atom_indices=[]): @@ -539,7 +539,5 @@ def _reassign_props_after_reaction(reactant, product): idx = atom.GetUnsignedProp("react_atom_idx") old_atom = reactant.GetAtomWithIdx(idx) for prop, value in old_atom.GetPropsAsDict().items(): - if prop.startswith("_MDAnalysis"): - attr = prop.split("_")[-1] - _set_atom_property(atom, attr, value) + _set_atom_property(atom, prop, value) atom.ClearProp("react_atom_idx") diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index d7ea5e3833a..a0139add165 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -373,8 +373,9 @@ def test_standardize_patterns(self, smi, out): ]) def test_set_atom_property(self, attr, value, getter): atom = Chem.Atom(1) - _set_atom_property(atom, attr, value) - assert getattr(atom, getter)("_MDAnalysis_%s" % attr) == value + prop = "_MDAnalysis_%s" % attr + _set_atom_property(atom, prop, value) + assert getattr(atom, getter)(prop) == value @pytest.mark.parametrize("reactant, product, name", [ (dummy_reactant(), dummy_product(), "props"), @@ -385,8 +386,7 @@ def test_reassign_props_after_reaction(self, reactant, product, name): _reassign_props_after_reaction(reactant, product) atom = product.GetAtomWithIdx(0) if name == "props": - with pytest.raises(KeyError, match="foo"): - atom.GetProp("foo") + assert atom.GetProp("foo") == "bar" assert atom.GetIntProp("_MDAnalysis_index") == 1 assert atom.GetDoubleProp("_MDAnalysis_charge") == 4.2 assert atom.GetProp("_MDAnalysis_type") == "C.3" From 28d04ed17562117ac9cab59d1a6ed90e67474aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 21 Jul 2020 20:41:14 +0200 Subject: [PATCH 59/90] general solution --- package/MDAnalysis/coordinates/RDKit.py | 80 ++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 4b1daad35be..b724b2fa26a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -485,13 +485,13 @@ def _standardize_patterns(mol): ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), - ("anion-*=*-anion", "[*-:1]-[*:2]=[*:3]-[*-:4]" - ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), - ("anion-*=*-*=*-anion", "[*-:1][*:2]=[*:3][*:4]=[*:5][*-:6]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), - ("anion-*=*-*=*-*=*-anion", - "[*-:1][*:2]=[*:3][*:4]=[*:5][*:6]=[*:7][*-:8]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]-[*:7]=[*;+0:8]"), + # ("anion-*=*-anion", "[*-:1]-[*:2]=[*:3]-[*-:4]" + # ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), + # ("anion-*=*-*=*-anion", "[*-:1][*:2]=[*:3][*:4]=[*:5][*-:6]" + # ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), + # ("anion-*=*-*=*-*=*-anion", + # "[*-:1][*:2]=[*:3][*:4]=[*:5][*:6]=[*:7][*-:8]" + # ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]-[*:7]=[*;+0:8]"), ]: # count how many times the reaction should be run pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) @@ -522,8 +522,70 @@ def _standardize_patterns(mol): for fragment in fragments: mol = Chem.CombineMols(mol, fragment) + # fix successive single and double bonds + global conjugated_anion_smarts + conjugated_anion_smarts = Chem.MolFromSmarts("[*-:1][*;+0:2]=[*;+0;!O:3]") + mol = _iterative_conjugated_bonds(mol) + return mol +from IPython.display import display +def _iterative_conjugated_bonds(mol, max_iter=50): + rxn = AllChem.ReactionFromSmarts( + "[*-:1][*;+0:2]=[*;+0;!O:3]>>[*;+0:1]=[*:2]-[*-:3]") + end_rxn = AllChem.ReactionFromSmarts( + "[*-:1]-[*:2]=[*:3]-[*-:4]>>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]") + # + previous_paths = [] + + for _ in range(max_iter): + end_prod = end_rxn.RunReactants((mol,)) + if end_prod: + product = end_prod[0][0] + _reassign_props_after_reaction(mol, product) + # continue if other groups that match the pattern are present + mol = product + continue + + products = rxn.RunReactants((mol,)) + if products: + product, path = _choose_optimal_product(products, previous_paths) + previous_paths.append(path) + _reassign_props_after_reaction(mol, product) + # keep going until we trigger the end_rxn + mol = product + continue + + # reactions not applicable to the molecule: all patterns were + # standardized or their was no pattern to standardize + return mol + + # iterative procedure could not be completed within max_iter iterations + warnings.warn("The standardization could not be completed within a " + "reasonable ammount of iterations") + return mol + + +def _choose_optimal_product(products, previous_paths): + # paths = [] + # for n, product in enumerate(products): + # mol = product[0] + # paths.append([]) + # anions = np.unique([m[0] + # for m in mol.GetSubstructMatches( + # conjugated_anion_smarts)]).tolist() + # for idx, i in enumerate(anions, start=1): + # for j in anions[idx:]: + # path = Chem.GetShortestPath(mol, i, j) + # if path not in previous_paths: + # paths[n].append(path) + # paths[n] = min(paths[n], key=len, default=[]) + # best = np.argmin([len(p) for p in paths]) + # if not paths[best]: + # best = np.random.randint(len(paths)) + # return products[best][0], paths[best] + return products[1][0], () + def _reassign_props_after_reaction(reactant, product): """Maps back atomic properties from the reactant to the product. @@ -540,4 +602,8 @@ def _reassign_props_after_reaction(reactant, product): old_atom = reactant.GetAtomWithIdx(idx) for prop, value in old_atom.GetPropsAsDict().items(): _set_atom_property(atom, prop, value) + # fix bonds with "crossed" stereo + for bond in atom.GetBonds(): + if bond.GetStereo() == Chem.BondStereo.STEREOANY: + bond.SetStereo(Chem.BondStereo.STEREONONE) atom.ClearProp("react_atom_idx") From ae73afffd52f0a7a76c175ec0e919394c6f5a614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Tue, 21 Jul 2020 21:17:14 +0200 Subject: [PATCH 60/90] back to old method for conjugated systems --- package/MDAnalysis/coordinates/RDKit.py | 147 +++++++----------- .../MDAnalysisTests/coordinates/test_rdkit.py | 5 +- 2 files changed, 60 insertions(+), 92 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index b724b2fa26a..1743a75de0b 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -466,12 +466,8 @@ def _standardize_patterns(mol): Due to the way reactions work, we first have to split the molecule by fragments. Then, for each fragment, we apply the standardization reactions. - If a pattern is matched N times in the molecule, the reaction will return N - products as an array of shape (N, 1). Only the first product will be kept - and the same reaction will be reapplied to the product N times in total. Finally, the fragments are recombined. """ - fragments = [] for reactant in Chem.GetMolFrags(mol, asMols=True): @@ -485,35 +481,25 @@ def _standardize_patterns(mol): ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), - # ("anion-*=*-anion", "[*-:1]-[*:2]=[*:3]-[*-:4]" - # ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), - # ("anion-*=*-*=*-anion", "[*-:1][*:2]=[*:3][*:4]=[*:5][*-:6]" - # ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), - # ("anion-*=*-*=*-*=*-anion", - # "[*-:1][*:2]=[*:3][*:4]=[*:5][*:6]=[*:7][*-:8]" - # ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]-[*:7]=[*;+0:8]"), + ("conjugated1", "[*-:1]-[*:2]=[*:3]-[*-:4]" + ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), + ("conjugated2", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*-:6]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), + ("conjugated3", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*-:8]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" + "-[*:7]=[*;+0:8]"), + ("conjugated4", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*:8]" + "=[*:9]-[*-:10]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" + "-[*:7]=[*:8]-[*:9]=[*;+0:10]"), + ("conjugated5", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*:8]" + "=[*:9]-[*:10]=[*:11]-[*-:12]" + ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" + "-[*:7]=[*:8]-[*:9]=[*:10]-[*:11]=[*;+0:12]"), ]: - # count how many times the reaction should be run - pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) - n_matches = len(reactant.GetSubstructMatches(pattern)) - - # run the reaction for each matched pattern - rxn = AllChem.ReactionFromSmarts(reaction) - for n in range(n_matches): - products = rxn.RunReactants((reactant,)) - # only keep the first product - if products: - product = products[0][0] - product.UpdatePropertyCache(strict=False) - # map back atom properties from the reactant to the product - _reassign_props_after_reaction(reactant, product) - # apply the next reaction to the product - reactant = product - else: - # exit the n_matches loop if there's no product. Example - # where this is needed: SO^{4}_{2-} will match the sulfone - # pattern 6 times but the reaction is only needed once - break + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + reactant = _run_reaction(reaction, reactant) fragments.append(reactant) @@ -522,69 +508,50 @@ def _standardize_patterns(mol): for fragment in fragments: mol = Chem.CombineMols(mol, fragment) - # fix successive single and double bonds - global conjugated_anion_smarts - conjugated_anion_smarts = Chem.MolFromSmarts("[*-:1][*;+0:2]=[*;+0;!O:3]") - mol = _iterative_conjugated_bonds(mol) - return mol -from IPython.display import display -def _iterative_conjugated_bonds(mol, max_iter=50): - rxn = AllChem.ReactionFromSmarts( - "[*-:1][*;+0:2]=[*;+0;!O:3]>>[*;+0:1]=[*:2]-[*-:3]") - end_rxn = AllChem.ReactionFromSmarts( - "[*-:1]-[*:2]=[*:3]-[*-:4]>>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]") - # - previous_paths = [] - - for _ in range(max_iter): - end_prod = end_rxn.RunReactants((mol,)) - if end_prod: - product = end_prod[0][0] - _reassign_props_after_reaction(mol, product) - # continue if other groups that match the pattern are present - mol = product - continue - products = rxn.RunReactants((mol,)) - if products: - product, path = _choose_optimal_product(products, previous_paths) - previous_paths.append(path) - _reassign_props_after_reaction(mol, product) - # keep going until we trigger the end_rxn - mol = product - continue - - # reactions not applicable to the molecule: all patterns were - # standardized or their was no pattern to standardize - return mol - - # iterative procedure could not be completed within max_iter iterations - warnings.warn("The standardization could not be completed within a " - "reasonable ammount of iterations") - return mol +def _run_reaction(reaction, reactant): + """Runs a reaction until all reactants are transformed + If a pattern is matched N times in the molecule, the reaction will return N + products as an array of shape (N, 1). Only the first product will be kept + and the same reaction will be reapplied to the product N times in total. + + Parameters + ---------- -def _choose_optimal_product(products, previous_paths): - # paths = [] - # for n, product in enumerate(products): - # mol = product[0] - # paths.append([]) - # anions = np.unique([m[0] - # for m in mol.GetSubstructMatches( - # conjugated_anion_smarts)]).tolist() - # for idx, i in enumerate(anions, start=1): - # for j in anions[idx:]: - # path = Chem.GetShortestPath(mol, i, j) - # if path not in previous_paths: - # paths[n].append(path) - # paths[n] = min(paths[n], key=len, default=[]) - # best = np.argmin([len(p) for p in paths]) - # if not paths[best]: - # best = np.random.randint(len(paths)) - # return products[best][0], paths[best] - return products[1][0], () + reaction : str + SMARTS reaction + reactant : rdkit.Chem.rdchem.RWMol + The molecule to transform + + Returns + ------- + Final product of the reaction, as an rdkit.Chem.rdchem.RWMol + """ + # count how many times the reaction should be run + pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) + n_matches = len(reactant.GetSubstructMatches(pattern)) + + # run the reaction for each matched pattern + rxn = AllChem.ReactionFromSmarts(reaction) + for n in range(n_matches): + products = rxn.RunReactants((reactant,)) + # only keep the first product + if products: + product = products[0][0] + # map back atom properties from the reactant to the product + _reassign_props_after_reaction(reactant, product) + # apply the next reaction to the product + product.UpdatePropertyCache(strict=False) + reactant = product + else: + # exit the n_matches loop if there's no product. Example + # where this is needed: SO^{4}_{2-} will match the sulfone + # pattern 6 times but the reaction is only needed once + break + return reactant def _reassign_props_after_reaction(reactant, product): diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index a0139add165..b347b9dbe8a 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -413,7 +413,8 @@ def test_reassign_props_after_reaction(self, reactant, product, name): "c1csc(c1)-c1ccoc1-c1cc[nH]c1", "C1=C2C(=NC=N1)N=CN2", "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", - #"c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders @@ -424,7 +425,7 @@ def test_order_independant(self, smi_in): for bond in template.GetBonds(): bond.SetIsAromatic(False) bond.SetBondType(Chem.BondType.SINGLE) - Chem.SanitizeMol(template) + # go through each possible starting atom for a in template.GetAtoms(): smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) From d6cd44d923f430155fccc9687e0659788cbf6210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 22 Jul 2020 10:53:21 +0200 Subject: [PATCH 61/90] added "RDKit" in changelog --- package/CHANGELOG | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 2548413f9b4..bafd7c18798 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -53,8 +53,8 @@ Enhancements * Added Hydrogen Bond Lifetime via existing autocorrelation features (PR #2791) * Added Hydrogen Bond Lifetime keyword "between" (PR #2791) * Dead code removed from the TPR parser and increased test coverage (PR #2840) - * Added a converter that works for any input with all hydrogens explicit in - the topology (Issue #2468, PR #2775) + * Added an RDKit converter that works for any input with all hydrogens + explicit in the topology (Issue #2468, PR #2775) Changes * deprecated NumPy type aliases have been replaced with their actual types From 1398f8c19dab415565151e5ccb11a365e84d698c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 22 Jul 2020 11:21:53 +0200 Subject: [PATCH 62/90] fix when first atom is charged --- package/MDAnalysis/coordinates/RDKit.py | 9 +++++---- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 1743a75de0b..326908bea46 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -412,15 +412,16 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): for atom in mol.GetAtoms(): # get NUE for each possible valence expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) - current_v = atom.GetTotalValence() + current_v = atom.GetTotalValence() - atom.GetFormalCharge() nue = [v - current_v for v in expected_vs] - # if there's only one possible valence state and the corresponding # NUE is negative, it means we can only add a positive charge to # the atom if (len(nue) == 1) and (nue[0] < 0): atom.SetFormalCharge(-nue[0]) mol.UpdatePropertyCache(strict=False) + # go to next atom if above case or atom has no unpaired electron + if (len(nue) == 1) and (nue[0] <= 0): continue else: neighbors = atom.GetNeighbors() @@ -429,7 +430,7 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): # get NUE for the neighbor na_expected_vs = PERIODIC_TABLE.GetValenceList( na.GetAtomicNum()) - na_current_v = na.GetTotalValence() + na_current_v = na.GetTotalValence() - na.GetFormalCharge() na_nue = [v - na_current_v for v in na_expected_vs] # smallest common NUE common_nue = min( @@ -450,7 +451,7 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): nue = [v - current_v for v in expected_vs] # if the atom still has unpaired electrons - current_v = atom.GetTotalValence() + current_v = atom.GetTotalValence() - atom.GetFormalCharge() nue = [v - current_v for v in expected_vs][0] if nue > 0: # keep the radical if it's a terminal atom diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index b347b9dbe8a..1584925f2b8 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -415,6 +415,7 @@ def test_reassign_props_after_reaction(self, reactant, product, name): "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From e71a686ca4d0fb7b933d47d68dcac6b35233f1fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 22 Jul 2020 15:35:44 +0200 Subject: [PATCH 63/90] working general solution to conjugated systems --- package/MDAnalysis/coordinates/RDKit.py | 112 +++++++++++++++--- .../MDAnalysisTests/coordinates/test_rdkit.py | 8 ++ 2 files changed, 104 insertions(+), 16 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 326908bea46..97822ea5a46 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -463,12 +463,18 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): def _standardize_patterns(mol): - """Standardize functional groups using reactions from SMARTS patterns + """Standardizes functional groups + Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, + and SMARTS reactions for other functional groups. Due to the way reactions work, we first have to split the molecule by fragments. Then, for each fragment, we apply the standardization reactions. Finally, the fragments are recombined. """ + + # standardize conjugated systems + _rebuild_conjugated_bonds(mol) + fragments = [] for reactant in Chem.GetMolFrags(mol, asMols=True): @@ -482,21 +488,6 @@ def _standardize_patterns(mol): ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), - ("conjugated1", "[*-:1]-[*:2]=[*:3]-[*-:4]" - ">>[*;+0:1]=[*:2]-[*:3]=[*;+0:4]"), - ("conjugated2", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*-:6]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*;+0:6]"), - ("conjugated3", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*-:8]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" - "-[*:7]=[*;+0:8]"), - ("conjugated4", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*:8]" - "=[*:9]-[*-:10]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" - "-[*:7]=[*:8]-[*:9]=[*;+0:10]"), - ("conjugated5", "[*-:1]-[*:2]=[*:3]-[*:4]=[*:5]-[*:6]=[*:7]-[*:8]" - "=[*:9]-[*:10]=[*:11]-[*-:12]" - ">>[*;+0:1]=[*:2]-[*:3]=[*:4]-[*:5]=[*:6]" - "-[*:7]=[*:8]-[*:9]=[*:10]-[*:11]=[*;+0:12]"), ]: reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) @@ -555,6 +546,95 @@ def _run_reaction(reaction, reactant): return reactant +def _rebuild_conjugated_bonds(mol, max_iter=200): + """Rebuild conjugated bonds without negatively charged atoms at the + begining and end of the conjugated system + + Depending on the order in which atoms are read during the conversion, the + :func:`_infer_bo_and_charges` function might write conjugated systems with + a double bond less and both edges of the system as anions instead of the + usual alternating single and double bonds. This function corrects this + behaviour by using an iterative procedure. + The problematic molecules always follow the same pattern: + `anion(-*=*)n-anion` instead of `*=(*-*=)n*`, where `n` is the number of + successive single and double bonds. The goal of the iterative procedure is + to make `n` as small as possible by consecutively transforming + `anion-*=*` into `*=*-anion` until it reaches the smallest pattern with + `n=1`. This last pattern is then transformed from `anion-*=*-anion` to + `*=*-*=*`. + Since `anion-*=*` is the same as `*=*-anion` in terms of SMARTS, we can + control that we don't tranform the same triplet of atoms back and forth by + adding their indices to a list. + The molecule needs to be kekulized first to also cover systems + with aromatic rings. + + Parameters + ---------- + + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform + max_iter : int + Maximum number of iterations performed by the function + """ + mol.UpdatePropertyCache(strict=False) + Chem.Kekulize(mol) + pattern = Chem.MolFromSmarts("[*-]-[*;+0]=[*;+0;!O]") + end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[*-]") + backtrack = [] + for _ in range(max_iter): + # simplest case where n=1 + end_match = mol.GetSubstructMatch(end_pattern) + if end_match: + # index of each atom + anion1, a1, a2, anion2 = end_match + # charges + mol.GetAtomWithIdx(anion1).SetFormalCharge(0) + mol.GetAtomWithIdx(anion2).SetFormalCharge(0) + # bonds + mol.GetBondBetweenAtoms(anion1, a1).SetBondType( + Chem.BondType.DOUBLE) + mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) + mol.GetBondBetweenAtoms(a2, anion2).SetBondType( + Chem.BondType.DOUBLE) + mol.UpdatePropertyCache(strict=False) + + # shorten the anion-anion pattern from n to n-1 + matches = mol.GetSubstructMatches(pattern) + if matches: + # check if we haven't already transformed this triplet + i = 0 + while i < len(matches): + # sort the indices for the comparison + g = sorted(matches[i]) + if g in backtrack: + # already transformed + i += 1 + else: + # take the first one that hasn't been tried + anion, a1, a2 = matches[i] + backtrack.append(g) + break + else: + anion, a1, a2 = matches[0] + # charges + mol.GetAtomWithIdx(anion).SetFormalCharge(0) + mol.GetAtomWithIdx(a2).SetFormalCharge(-1) + # bonds + mol.GetBondBetweenAtoms(anion, a1).SetBondType( + Chem.BondType.DOUBLE) + mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) + mol.UpdatePropertyCache(strict=False) + # start new iteration + continue + + # no more changes to apply + return + + # reached max_iter + warnings.warn("The standardization could not be completed within a " + "reasonable ammount of iterations") + + def _reassign_props_after_reaction(reactant, product): """Maps back atomic properties from the reactant to the product. The product molecule is modified inplace. diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 1584925f2b8..3fa452ab060 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -49,6 +49,7 @@ def test_converter_requires_rdkit(self): _add_mda_attr_to_rdkit, _infer_bo_and_charges, _standardize_patterns, + _rebuild_conjugated_bonds, _set_atom_property, _reassign_props_after_reaction, ) @@ -441,3 +442,10 @@ def test_order_independant(self, smi_in): assert m.HasSubstructMatch(ref) and ref.HasSubstructMatch( m), "Failed when starting from atom %s%d" % ( a.GetSymbol(), a.GetIndex()) + + def test_warn_conjugated_max_iter(self): + smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" + mol = Chem.MolFromSmiles(smi) + with pytest.warns(UserWarning, + match="reasonable ammount of iterations"): + _rebuild_conjugated_bonds(mol, 2) From 925bd7f2f7f22f2c63320d26032226e51bee1bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 22 Jul 2020 15:40:31 +0200 Subject: [PATCH 64/90] pep8 --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 3fa452ab060..ebd1bd5c80f 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -446,6 +446,6 @@ def test_order_independant(self, smi_in): def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) - with pytest.warns(UserWarning, + with pytest.warns(UserWarning, match="reasonable ammount of iterations"): _rebuild_conjugated_bonds(mol, 2) From e5282f3cf01bb2b5a1bf0f1ea71ea4ee1cf1e8a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 22 Jul 2020 16:53:53 +0200 Subject: [PATCH 65/90] for loop instead of while --- package/MDAnalysis/coordinates/RDKit.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 97822ea5a46..00495db7c96 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -602,16 +602,15 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): matches = mol.GetSubstructMatches(pattern) if matches: # check if we haven't already transformed this triplet - i = 0 - while i < len(matches): + for match in matches: # sort the indices for the comparison - g = sorted(matches[i]) + g = tuple(sorted(match)) if g in backtrack: # already transformed - i += 1 + continue else: # take the first one that hasn't been tried - anion, a1, a2 = matches[i] + anion, a1, a2 = match backtrack.append(g) break else: From b7ac69e4d697ba2b0ccc925453e55e53330fdfc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 23 Jul 2020 17:48:41 +0200 Subject: [PATCH 66/90] bugfix for conjugated system with N at one edge --- package/MDAnalysis/coordinates/RDKit.py | 30 ++++++++++++++----- .../MDAnalysisTests/coordinates/test_rdkit.py | 4 ++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 00495db7c96..8687553ed49 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -479,14 +479,14 @@ def _standardize_patterns(mol): for reactant in Chem.GetMolFrags(mol, asMols=True): for name, reaction in [ - ("Cterm", "[C-;v3:1]=[O:2]>>[C;+0:1]=[O:2]"), - ("Nterm", "[N-;v2;H1:1]>>[N;+0:1]"), + ("Cterm", "[C-;X2:1]=[O:2]>>[C;+0:1]=[O:2]"), + ("Nterm", "[N-;X2;H1:1]>>[N;+0:1]"), ("keto-enolate", "[C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3]"), - ("ARG", "[N;H1:1]-[C-;v3:2](-[N;H2:3])-[N;H2:4]" + ("ARG", "[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]" ">>[N:1]-[C;+0:2](-[N:3])=[N;+1:4]"), - ("sulfone", "[S;v4:1](-[O-;v1:2])-[O-;v1:3]" - ">>[S;v6:1](=[O;+0:2])=[O;+0:3]"), - ("nitro", "[N;v3:1](-[O-;v1:2])-[O-;v1:3]" + ("sulfone", "[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]" + ">>[S:1](=[O;+0:2])=[O;+0:3]"), + ("nitro", "[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]" ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), ]: reactant.UpdatePropertyCache(strict=False) @@ -579,7 +579,21 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) pattern = Chem.MolFromSmarts("[*-]-[*;+0]=[*;+0;!O]") - end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[*-]") + # number of unique matches with the pattern + n_matches = len(set([match[0] + for match in mol.GetSubstructMatches(pattern)])) + if n_matches == 0: + # nothing to standardize + return + # check if there's an even number of anion-*=* patterns + elif n_matches % 2 == 0: + end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[*-]") + end_charge = 0 + else: + # the only way to standardize is to find a nitrogen that can accept + # a double bond and a positive charge + end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[N;X3;v3]") + end_charge = 1 backtrack = [] for _ in range(max_iter): # simplest case where n=1 @@ -589,7 +603,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): anion1, a1, a2, anion2 = end_match # charges mol.GetAtomWithIdx(anion1).SetFormalCharge(0) - mol.GetAtomWithIdx(anion2).SetFormalCharge(0) + mol.GetAtomWithIdx(anion2).SetFormalCharge(end_charge) # bonds mol.GetBondBetweenAtoms(anion1, a1).SetBondType( Chem.BondType.DOUBLE) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index ebd1bd5c80f..cee8a606f2e 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -417,6 +417,7 @@ def test_reassign_props_after_reaction(self, reactant, product, name): "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", "C=CC=CC=CC=CC=CC=C", "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders @@ -433,6 +434,7 @@ def test_order_independant(self, smi_in): smi = Chem.MolToSmiles(template, rootedAtAtom=a.GetIdx()) m = Chem.MolFromSmiles(smi, sanitize=False) for atom in m.GetAtoms(): + atom.SetFormalCharge(0) atom.SetNoImplicit(True) m.UpdatePropertyCache(strict=False) _infer_bo_and_charges(m) @@ -441,7 +443,7 @@ def test_order_independant(self, smi_in): m = Chem.RemoveHs(m) assert m.HasSubstructMatch(ref) and ref.HasSubstructMatch( m), "Failed when starting from atom %s%d" % ( - a.GetSymbol(), a.GetIndex()) + a.GetSymbol(), a.GetIdx()) def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" From aba4b1b9c1952173e84e4cdd27dd857c607864ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 27 Jul 2020 15:25:41 +0200 Subject: [PATCH 67/90] fix typos --- package/MDAnalysis/coordinates/RDKit.py | 18 ++++++++---------- .../MDAnalysisTests/coordinates/test_rdkit.py | 12 +++++------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 8687553ed49..d5dfd125d6a 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -305,9 +305,7 @@ def convert(self, obj, NoImplicit=True): atom_mapper[atom.ix] = index try: - if (len(ag.bonds) == 0) and (ag.n_atoms > 1): - # force guessing bonds - raise NoDataError + ag.bonds except NoDataError: warnings.warn( "No `bonds` attribute in this AtomGroup. Guessing bonds based " @@ -360,7 +358,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): # convert numpy types to python standard types value = value.item() if attr == "names": - # RDKit needs the name to be properly formated for a + # RDKit needs the name to be properly formatted for a # PDB file (1 letter elements start at col 14) name = re.findall(r'(\D+|\d+)', value) if len(name) == 2: @@ -386,7 +384,7 @@ def _set_atom_property(atom, attr, value): def _infer_bo_and_charges(mol, terminal_atom_indices=[]): """Infer bond orders and formal charges from a molecule. - Since most MD topology files don't explicitely retain informations on bond + Since most MD topology files don't explicitly retain information on bond orders or charges, it has to be guessed from the topology. This is done by looping other each atom and comparing its expected valence to the current valence to get the Number of Unpaired Electrons (NUE). @@ -394,8 +392,8 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): If two neighbouring atoms have UEs, the bond between them most likely has to be increased by the value of the smallest NUE. If after this process, an atom still has UEs, it's either a radical - (because one its bonds was cut when creating the AtomGroup) or it needs a - negative formal charge of -NUE. Since these radical atoms can be detected + (because one of its bonds was cut when creating the AtomGroup) or it needs + a negative formal charge of -NUE. Since these radical atoms can be detected when looping over the bonds of the AtomGroup, only atoms that are not part of this "terminal_atoms" list will be assigned a negative formal charge. @@ -548,7 +546,7 @@ def _run_reaction(reaction, reactant): def _rebuild_conjugated_bonds(mol, max_iter=200): """Rebuild conjugated bonds without negatively charged atoms at the - begining and end of the conjugated system + beginning and end of the conjugated system Depending on the order in which atoms are read during the conversion, the :func:`_infer_bo_and_charges` function might write conjugated systems with @@ -563,7 +561,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): `n=1`. This last pattern is then transformed from `anion-*=*-anion` to `*=*-*=*`. Since `anion-*=*` is the same as `*=*-anion` in terms of SMARTS, we can - control that we don't tranform the same triplet of atoms back and forth by + control that we don't transform the same triplet of atoms back and forth by adding their indices to a list. The molecule needs to be kekulized first to also cover systems with aromatic rings. @@ -645,7 +643,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): # reached max_iter warnings.warn("The standardization could not be completed within a " - "reasonable ammount of iterations") + "reasonable number of iterations") def _reassign_props_after_reaction(reactant, product): diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index cee8a606f2e..5b278b436d1 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -28,7 +28,7 @@ from numpy.testing import (assert_equal, assert_almost_equal) -from MDAnalysisTests.datafiles import mol2_molecule, PDB_full, GRO +from MDAnalysisTests.datafiles import mol2_molecule, PDB_full, GRO, PDB_helix from MDAnalysisTests.util import block_import, import_not_available @@ -228,13 +228,11 @@ def test_raise_requires_elements(self): ): u.atoms.convert_to("RDKIT") - def test_warn_guess_bonds(self, pdb): - pdb.delete_bonds(pdb.bonds) - ag = pdb.select_atoms("resnum 101 and segid A") - pdb.delete_bonds(ag.bonds) + def test_warn_guess_bonds(self): + u = mda.Universe(PDB_helix) with pytest.warns(UserWarning, match="No `bonds` attribute in this AtomGroup"): - ag.convert_to("RDKIT") + u.atoms.convert_to("RDKIT") def test_warn_no_hydrogen(self): u = mda.Universe.from_smiles("O=O") @@ -449,5 +447,5 @@ def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) with pytest.warns(UserWarning, - match="reasonable ammount of iterations"): + match="reasonable number of iterations"): _rebuild_conjugated_bonds(mol, 2) From 49f3bff16a6db408a0ab55defa5454810db00980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 27 Jul 2020 15:45:07 +0200 Subject: [PATCH 68/90] fix test for min deps --- .../MDAnalysisTests/coordinates/test_rdkit.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 5b278b436d1..d8b99e8a94e 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -32,15 +32,6 @@ from MDAnalysisTests.util import block_import, import_not_available -@block_import('rdkit') -class TestRequiresRDKit(object): - def test_converter_requires_rdkit(self): - u = mda.Universe(mol2_molecule) - with pytest.raises(ImportError, - match="RDKit is required for the RDKitConverter"): - u.atoms.convert_to("RDKIT") - - try: from rdkit import Chem from rdkit.Chem import AllChem @@ -54,6 +45,8 @@ def test_converter_requires_rdkit(self): _reassign_props_after_reaction, ) except ImportError: + rdkit_installed = False + def mol2_mol(): pass @@ -72,6 +65,8 @@ def dummy_reactant(): def dummy_reactant_noprops(): pass else: + rdkit_installed = True + def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) @@ -117,6 +112,16 @@ def dummy_reactant(): reason="requires RDKit") +@pytest.mark.skipif(rdkit_installed, + reason="only for min dependencies build") +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + u = mda.Universe(PDB_full) + with pytest.raises(ImportError, + match="RDKit is required for the RDKitConverter"): + u.atoms.convert_to("RDKIT") + + @requires_rdkit class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ From 70cce99ab34123dcf706dfe63dc43b0d0f9a4975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 27 Jul 2020 16:21:00 +0200 Subject: [PATCH 69/90] use indirect parametrization --- .../MDAnalysisTests/coordinates/test_rdkit.py | 81 ++++++++----------- 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index d8b99e8a94e..6afe345e876 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -29,7 +29,7 @@ assert_almost_equal) from MDAnalysisTests.datafiles import mol2_molecule, PDB_full, GRO, PDB_helix -from MDAnalysisTests.util import block_import, import_not_available +from MDAnalysisTests.util import import_not_available try: @@ -45,28 +45,25 @@ _reassign_props_after_reaction, ) except ImportError: - rdkit_installed = False + pass - def mol2_mol(): - pass - def smiles_mol(): - pass +requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), + reason="requires RDKit") - def dummy_product(): - pass - def dummy_product_nomap(): - pass +@pytest.mark.skipif(not import_not_available("rdkit"), + reason="only for min dependencies build") +class TestRequiresRDKit(object): + def test_converter_requires_rdkit(self): + u = mda.Universe(PDB_full) + with pytest.raises(ImportError, + match="RDKit is required for the RDKitConverter"): + u.atoms.convert_to("RDKIT") - def dummy_reactant(): - pass - - def dummy_reactant_noprops(): - pass -else: - rdkit_installed = True +@requires_rdkit +class MolFactory: def mol2_mol(): return Chem.MolFromMol2File(mol2_molecule, removeHs=False) @@ -108,26 +105,22 @@ def dummy_reactant(): return mol -requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), - reason="requires RDKit") +@pytest.fixture(scope="function") +def rdmol(request): + return getattr(MolFactory, request.param)() -@pytest.mark.skipif(rdkit_installed, - reason="only for min dependencies build") -class TestRequiresRDKit(object): - def test_converter_requires_rdkit(self): - u = mda.Universe(PDB_full) - with pytest.raises(ImportError, - match="RDKit is required for the RDKitConverter"): - u.atoms.convert_to("RDKIT") +@pytest.fixture(scope="function") +def product(request): + return getattr(MolFactory, request.param)() @requires_rdkit class TestRDKitReader(object): @pytest.mark.parametrize("rdmol, n_frames", [ - (mol2_mol(), 1), - (smiles_mol(), 3), - ]) + ("mol2_mol", 1), + ("smiles_mol", 3), + ], indirect=["rdmol"]) def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames @@ -144,7 +137,7 @@ def test_no_coordinates(self): assert_equal(u.trajectory.coordinate_array, expected) def test_compare_mol2reader(self): - universe = mda.Universe(mol2_mol()) + universe = mda.Universe(MolFactory.mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames assert_equal(universe.trajectory.ts.positions, @@ -212,15 +205,9 @@ def test_monomer_info(self, pdb, sel_str, atom_index): rd_value = rd_value.strip() assert rd_value == mda_value - def test_identical_topology_mol2(self, mol2): - """Check stereochemistry on atoms and bonds (but not yet)""" - rdmol = mol2_mol() - umol = mol2.atoms.convert_to("RDKIT") - assert rdmol.HasSubstructMatch(umol, useChirality=False) - assert umol.HasSubstructMatch(rdmol, useChirality=False) - - def test_identical_topology(self): - rdmol = smiles_mol() + @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], + indirect=True) + def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) @@ -381,13 +368,13 @@ def test_set_atom_property(self, attr, value, getter): _set_atom_property(atom, prop, value) assert getattr(atom, getter)(prop) == value - @pytest.mark.parametrize("reactant, product, name", [ - (dummy_reactant(), dummy_product(), "props"), - (dummy_reactant_noprops(), dummy_product(), "noprops"), - (dummy_reactant(), dummy_product_nomap(), "nomap"), - ]) - def test_reassign_props_after_reaction(self, reactant, product, name): - _reassign_props_after_reaction(reactant, product) + @pytest.mark.parametrize("rdmol, product, name", [ + ("dummy_reactant", "dummy_product", "props"), + ("dummy_reactant_noprops", "dummy_product", "noprops"), + ("dummy_reactant", "dummy_product_nomap", "nomap"), + ], indirect=["rdmol", "product"]) + def test_reassign_props_after_reaction(self, rdmol, product, name): + _reassign_props_after_reaction(rdmol, product) atom = product.GetAtomWithIdx(0) if name == "props": assert atom.GetProp("foo") == "bar" From e82cf0a3200f9faae08ad63806147e86eb605832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 29 Jul 2020 15:59:13 +0200 Subject: [PATCH 70/90] add coordinates + tests + fix index property --- package/MDAnalysis/coordinates/RDKit.py | 14 +++++- .../MDAnalysisTests/coordinates/test_rdkit.py | 43 +++++++++++++++---- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d5dfd125d6a..4ffa064b295 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -299,7 +299,7 @@ def convert(self, obj, NoImplicit=True): value = other_attrs[attr][i] attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular _set_atom_property(rdatom, attr, value) - _set_atom_property(rdatom, "_MDAnalysis_index", int(atom.ix)) + _set_atom_property(rdatom, "_MDAnalysis_index", i) # add atom index = mol.AddAtom(rdatom) atom_mapper[atom.ix] = index @@ -337,6 +337,18 @@ def convert(self, obj, NoImplicit=True): # sanitize Chem.SanitizeMol(mol) + if hasattr(ag, "positions") and not np.isnan(ag.positions).any(): + # assign coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for atom in mol.GetAtoms(): + idx = atom.GetIntProp("_MDAnalysis_index") + xyz = [float(pos) for pos in ag.positions[idx]] + conf.SetAtomPosition(atom.GetIdx(), xyz) + mol.AddConformer(conf) + # assign R/S to atoms and Z/E to bonds + Chem.AssignStereochemistryFrom3D(mol) + Chem.SetDoubleBondNeighborDirections(mol) + return mol diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 6afe345e876..71596813a4f 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -193,8 +193,7 @@ def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) umol = sel.convert_to("RDKIT") atom = umol.GetAtomWithIdx(atom_index) - mda_index = np.where( - sel.indices == atom.GetIntProp("_MDAnalysis_index")) + mda_index = atom.GetIntProp("_MDAnalysis_index") mi = atom.GetMonomerInfo() for mda_attr, rd_attr in RDATTRIBUTES.items(): @@ -268,10 +267,12 @@ def test_bfactors_tempfactors_raises_error(self): @pytest.mark.parametrize("idx", [0, 10, 42]) def test_other_attributes(self, mol2, idx): mol = mol2.atoms.convert_to("RDKIT") - rdprops = mol.GetAtomWithIdx(idx).GetPropsAsDict() + rdatom = mol.GetAtomWithIdx(idx) + rdprops = rdatom.GetPropsAsDict() + mda_idx = int(rdprops["_MDAnalysis_index"]) for prop in ["charge", "segid", "type"]: rdprop = rdprops["_MDAnalysis_%s" % prop] - mdaprop = getattr(mol2.atoms[idx], prop) + mdaprop = getattr(mol2.atoms[mda_idx], prop) assert rdprop == mdaprop @pytest.mark.parametrize("sel_str", [ @@ -281,12 +282,37 @@ def test_other_attributes(self, mol2, idx): def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) mol = ag.convert_to("RDKIT") - expected = ag.indices - indices = np.array([a.GetIntProp("_MDAnalysis_index") - for a in mol.GetAtoms()], dtype=np.int32) - indices.sort() + expected = [i for i in range(len(ag))] + indices = sorted([a.GetIntProp("_MDAnalysis_index") + for a in mol.GetAtoms()]) assert_equal(indices, expected) + def test_assign_coordinates(self, pdb): + mol = pdb.atoms.convert_to("RDKIT") + positions = mol.GetConformer().GetPositions() + indices = sorted(mol.GetAtoms(), + key=lambda a: a.GetIntProp("_MDAnalysis_index")) + indices = [a.GetIdx() for a in indices] + assert_equal(positions[indices], pdb.atoms.positions) + + def test_assign_stereochemistry(self, mol2): + umol = mol2.atoms.convert_to("RDKIT") + rdmol = Chem.MolFromMol2File(mol2_molecule, removeHs=False) + assert rdmol.HasSubstructMatch( + umol, useChirality=True) and umol.HasSubstructMatch( + rdmol, useChirality=True) + + def test_trajectory_coords(self): + u = mda.Universe.from_smiles( + "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42)) + for ts in u.trajectory: + mol = u.atoms.convert_to("RDKIT") + positions = mol.GetConformer().GetPositions() + indices = sorted(mol.GetAtoms(), + key=lambda a: a.GetIntProp("_MDAnalysis_index")) + indices = [a.GetIdx() for a in indices] + assert_equal(positions[indices], ts.positions) + @requires_rdkit class TestRDKitFunctions(object): @@ -408,6 +434,7 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "C=CC=CC=CC=CC=CC=C", "NCCCCC([NH3+])C(=O)[O-]", "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From 3427c3c5f6134f582233ee4bfabdd19e7bfaf09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Wed, 29 Jul 2020 18:22:29 +0200 Subject: [PATCH 71/90] cache the molecule and only update conformers when iterating trajectory --- package/MDAnalysis/coordinates/RDKit.py | 58 ++++++++++++++----- .../MDAnalysisTests/coordinates/test_rdkit.py | 22 ++++++- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 4ffa064b295..3533c202138 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -60,6 +60,7 @@ import warnings import re +import copy import numpy as np @@ -219,8 +220,9 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} + _cache = dict() - def convert(self, obj, NoImplicit=True): + def convert(self, obj, **kwargs): """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. @@ -244,7 +246,45 @@ def convert(self, obj, NoImplicit=True): raise TypeError("No `atoms` attribute in object of type {}, " "please use a valid AtomGroup or Universe".format( type(obj))) from None + + # create the topology + key = id(ag) + # search for it in the cache first + try: + mol = self._cache[key] + except KeyError: + # only keep the current molecule in cache + self._cache.clear() + self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) + # continue on copy of the cached molecule + mol = copy.deepcopy(mol) + + # add a conformer for the current Timestep + if hasattr(ag, "positions") and not np.isnan(ag.positions).any(): + # assign coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for atom in mol.GetAtoms(): + idx = atom.GetIntProp("_MDAnalysis_index") + xyz = [float(pos) for pos in ag.positions[idx]] + conf.SetAtomPosition(atom.GetIdx(), xyz) + mol.AddConformer(conf) + # assign R/S to atoms and Z/E to bonds + Chem.AssignStereochemistryFrom3D(mol) + Chem.SetDoubleBondNeighborDirections(mol) + return mol + + + def atomgroup_to_mol(self, ag, NoImplicit=True): + """Converts an AtomGroup to an RDKit molecule. + + Parameters + ----------- + ag : AtomGroup + + NoImplicit : bool + Prevent adding hydrogens to the molecule + """ try: elements = ag.elements except NoDataError: @@ -321,8 +361,8 @@ def convert(self, obj, NoImplicit=True): # can happen for terminal atoms. # save the bond atom that is in the atomgroup for later terminal_atom_indices.extend([atom_mapper[i] - for i in bond.indices - if i in atom_mapper.keys()]) + for i in bond.indices + if i in atom_mapper.keys()]) # skip adding this bond continue bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) @@ -337,18 +377,6 @@ def convert(self, obj, NoImplicit=True): # sanitize Chem.SanitizeMol(mol) - if hasattr(ag, "positions") and not np.isnan(ag.positions).any(): - # assign coordinates - conf = Chem.Conformer(mol.GetNumAtoms()) - for atom in mol.GetAtoms(): - idx = atom.GetIntProp("_MDAnalysis_index") - xyz = [float(pos) for pos in ag.positions[idx]] - conf.SetAtomPosition(atom.GetIdx(), xyz) - mol.AddConformer(conf) - # assign R/S to atoms and Z/E to bonds - Chem.AssignStereochemistryFrom3D(mol) - Chem.SetDoubleBondNeighborDirections(mol) - return mol diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 71596813a4f..0511462efcf 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -309,10 +309,30 @@ def test_trajectory_coords(self): mol = u.atoms.convert_to("RDKIT") positions = mol.GetConformer().GetPositions() indices = sorted(mol.GetAtoms(), - key=lambda a: a.GetIntProp("_MDAnalysis_index")) + key=lambda a: a.GetIntProp("_MDAnalysis_index")) indices = [a.GetIdx() for a in indices] assert_equal(positions[indices], ts.positions) + def test_cache(self): + u = mda.Universe.from_smiles("CCO", numConfs=5) + ag = u.atoms + cache = mda.coordinates.RDKit.RDKitConverter._cache + previous_cache = None + for ts in u.trajectory: + mol = ag.convert_to("RDKIT") + if previous_cache: + # the cache shouldn't change when iterating on timesteps + assert cache == previous_cache + previous_cache = cache + # cached molecule shouldn't store coordinates + mol = list(cache.values())[0] + with pytest.raises(ValueError, match="Bad Conformer Id"): + mol.GetConformer() + # only 1 molecule should be cached + u = mda.Universe.from_smiles("C") + assert len(cache) == 1 + assert cache != previous_cache + @requires_rdkit class TestRDKitFunctions(object): From 5218daab7479f730b10ef384d463f30105879365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 30 Jul 2020 11:43:35 +0200 Subject: [PATCH 72/90] update docs --- package/MDAnalysis/coordinates/RDKit.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 3533c202138..f597d650651 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -211,6 +211,9 @@ class RDKitConverter(base.ConverterBase): guessed if not present. If both `tempfactors` and `bfactors` attributes are present, the conversion will fail, since only one of these should be present. + Hydrogens should be explicit in the topology file. If this is not the case, + use the parameter `NoImplicit=False` when using the converter to allow + implicit hydrogens and disable inferring bond orders and charges. .. versionadded:: 2.0.0 @@ -265,7 +268,7 @@ def convert(self, obj, **kwargs): conf = Chem.Conformer(mol.GetNumAtoms()) for atom in mol.GetAtoms(): idx = atom.GetIntProp("_MDAnalysis_index") - xyz = [float(pos) for pos in ag.positions[idx]] + xyz = ag.positions[idx].astype(float) conf.SetAtomPosition(atom.GetIdx(), xyz) mol.AddConformer(conf) # assign R/S to atoms and Z/E to bonds @@ -300,7 +303,10 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): "converter requires all hydrogens to be explicit. Please " "check carefully the output molecule as the converter is " "likely to add negative charges and assign incorrect bond " - "orders to structures with implicit hydrogens." + "orders to structures with implicit hydrogens. Alternatively, " + "you can use the parameter `NoImplicit=False` when using the " + "converter to allow implicit hydrogens and disable inferring " + "bond orders and charges." ) # attributes accepted in PDBResidueInfo object From 3313b567c5d52d962f465adb72b399136d490417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 30 Jul 2020 12:33:41 +0200 Subject: [PATCH 73/90] add kwargs to the cache key --- package/MDAnalysis/coordinates/RDKit.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index f597d650651..d63bc6c748e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -249,9 +249,10 @@ def convert(self, obj, **kwargs): raise TypeError("No `atoms` attribute in object of type {}, " "please use a valid AtomGroup or Universe".format( type(obj))) from None - + # create the topology - key = id(ag) + key = "<%x>" % id(ag) + ",".join( + ["%s=%s" % (str(k), str(v)) for k, v in kwargs.items()]) # search for it in the cache first try: mol = self._cache[key] @@ -277,10 +278,9 @@ def convert(self, obj, **kwargs): return mol - def atomgroup_to_mol(self, ag, NoImplicit=True): """Converts an AtomGroup to an RDKit molecule. - + Parameters ----------- ag : AtomGroup @@ -367,8 +367,8 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): # can happen for terminal atoms. # save the bond atom that is in the atomgroup for later terminal_atom_indices.extend([atom_mapper[i] - for i in bond.indices - if i in atom_mapper.keys()]) + for i in bond.indices + if i in atom_mapper.keys()]) # skip adding this bond continue bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) From a72a51b007f9136e36a83b2906f70d7280a2355f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 31 Jul 2020 12:41:06 +0200 Subject: [PATCH 74/90] use fstrings --- package/MDAnalysis/coordinates/RDKit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index d63bc6c748e..7b692101df6 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -251,8 +251,8 @@ def convert(self, obj, **kwargs): type(obj))) from None # create the topology - key = "<%x>" % id(ag) + ",".join( - ["%s=%s" % (str(k), str(v)) for k, v in kwargs.items()]) + key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" + for key, value in kwargs.items()) # search for it in the cache first try: mol = self._cache[key] From cec1f680401f650fbc1bfe2c96cace2da7543e13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 10 Aug 2020 16:27:05 +0200 Subject: [PATCH 75/90] cache kwarg + fix cache tests --- package/MDAnalysis/coordinates/RDKit.py | 36 ++++++++++++------- .../MDAnalysisTests/coordinates/test_rdkit.py | 11 +++++- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 7b692101df6..768f14e3434 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -225,7 +225,7 @@ class RDKitConverter(base.ConverterBase): units = {'time': None, 'length': 'Angstrom'} _cache = dict() - def convert(self, obj, **kwargs): + def convert(self, obj, cache=True, **kwargs): """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. @@ -233,8 +233,14 @@ def convert(self, obj, **kwargs): ----------- obj : AtomGroup or Universe + cache : bool + Use a cached copy of the molecule's topology when available. To be + used, the cached molecule and the new one have to be made from the + same AtomGroup object (same id) and with the same arguments passed + to the converter (with the exception of this `cache` argument) + NoImplicit : bool - Prevent adding hydrogens to the molecule + Prevent adding hydrogens to the molecule (default: True) """ try: from rdkit import Chem @@ -250,18 +256,22 @@ def convert(self, obj, **kwargs): "please use a valid AtomGroup or Universe".format( type(obj))) from None - # create the topology - key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" - for key, value in kwargs.items()) - # search for it in the cache first - try: - mol = self._cache[key] - except KeyError: - # only keep the current molecule in cache + if cache: + # key used to search the cache + key = f"<{id(ag):#x}>" + ",".join(f"{key}={value}" + for key, value in kwargs.items()) + try: + mol = self._cache[key] + except KeyError: + # only keep the current molecule in cache + self._cache.clear() + # create the topology + self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) + # continue on copy of the cached molecule + mol = copy.deepcopy(mol) + else: self._cache.clear() - self._cache[key] = mol = self.atomgroup_to_mol(ag, **kwargs) - # continue on copy of the cached molecule - mol = copy.deepcopy(mol) + mol = self.atomgroup_to_mol(ag, **kwargs) # add a conformer for the current Timestep if hasattr(ag, "positions") and not np.isnan(ag.positions).any(): diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 0511462efcf..7471a886933 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -21,6 +21,7 @@ # J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 # +import copy import pytest import MDAnalysis as mda from MDAnalysis.topology.guessers import guess_atom_element @@ -323,15 +324,23 @@ def test_cache(self): if previous_cache: # the cache shouldn't change when iterating on timesteps assert cache == previous_cache - previous_cache = cache + previous_cache = copy.deepcopy(cache) # cached molecule shouldn't store coordinates mol = list(cache.values())[0] with pytest.raises(ValueError, match="Bad Conformer Id"): mol.GetConformer() # only 1 molecule should be cached u = mda.Universe.from_smiles("C") + mol = u.atoms.convert_to("RDKIT") assert len(cache) == 1 assert cache != previous_cache + # cache should depend on passed arguments + previous_cache = copy.deepcopy(cache) + mol = u.atoms.convert_to("RDKIT", NoImplicit=False) + assert cache != previous_cache + # skip cache + mol = u.atoms.convert_to("RDKIT", cache=False) + assert cache == {} @requires_rdkit From 6c4c12380fc806012d09708216bee25ae5185981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 Aug 2020 15:50:15 +0200 Subject: [PATCH 76/90] fix code review + improve docs --- package/MDAnalysis/coordinates/RDKit.py | 103 ++++++++++-------- .../MDAnalysisTests/coordinates/test_rdkit.py | 53 +++++---- 2 files changed, 83 insertions(+), 73 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 768f14e3434..1411e27bf88 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -53,8 +53,11 @@ .. autoclass:: RDKitConverter :members: -.. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol + .. automethod:: RDKitConverter._infer_bo_and_charges + .. automethod:: RDKitConverter._standardize_patterns + .. automethod:: RDKitConverter._rebuild_conjugated_bonds +.. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol """ @@ -128,7 +131,6 @@ def __init__(self, filename, **kwargs): Parameters ---------- - filename : rdkit.Chem.rdchem.Mol RDKit molecule """ @@ -207,14 +209,35 @@ class RDKitConverter(base.ConverterBase): The converter requires the :class:`~MDAnalysis.core.topologyattrs.Elements` attribute to be present in the topology, else it will fail. + It also requires the `bonds` attribute, although they will be automatically guessed if not present. - If both `tempfactors` and `bfactors` attributes are present, the conversion - will fail, since only one of these should be present. + + If both ``tempfactors`` and ``bfactors`` attributes are present, the + conversion will fail, since only one of these should be present. + TODO: Wait for Issue #1901 for a solution + Hydrogens should be explicit in the topology file. If this is not the case, - use the parameter `NoImplicit=False` when using the converter to allow + use the parameter ``NoImplicit=False`` when using the converter to allow implicit hydrogens and disable inferring bond orders and charges. + Since one of the main use case of the converter is converting trajectories + and not just a topology, creating a new molecule from scratch for every + frame would be too slow so the converter uses a caching system. The cache + only remembers the id of the last AtomGroup that was converted, as well + as the arguments that were passed to the converter. This means that using + ``u.select_atoms("protein").convert_to("RDKIT")`` will not benefit from the + cache since the selection is deleted from memory as soon as the conversion + is finished. Instead, users should do this in two steps by first saving the + selection in a variable and then converting the saved AtomGroup. It also + means that ``ag.convert_to("RDKIT")`` followed by + ``ag.convert_to("RDKIT", NoImplicit=True)`` will not use the cache. + Finally if you're modifying the AtomGroup in place between two conversions, + the id of the AtomGroup won't change and thus the converter will use the + cached molecule. For this reason, you can pass a ``cache=False`` argument + to the converter to bypass the caching system. + The cached molecule doesn't contain the coordinates of the atoms. + .. versionadded:: 2.0.0 @@ -225,23 +248,25 @@ class RDKitConverter(base.ConverterBase): units = {'time': None, 'length': 'Angstrom'} _cache = dict() - def convert(self, obj, cache=True, **kwargs): + def convert(self, obj, cache=True, NoImplicit=True): """Write selection at current trajectory frame to :class:`rdkit.Chem.rdchem.Mol`. Parameters ----------- - obj : AtomGroup or Universe + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the same AtomGroup object (same id) and with the same arguments passed to the converter (with the exception of this `cache` argument) - NoImplicit : bool - Prevent adding hydrogens to the molecule (default: True) + Prevent adding hydrogens to the molecule """ + # parameters passed to atomgroup_to_mol and used by the cache + kwargs = dict(NoImplicit=NoImplicit) + try: from rdkit import Chem except ImportError: @@ -293,8 +318,7 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): Parameters ----------- - ag : AtomGroup - + ag : :class:`~MDAnalysis.core.groups.AtomGroup` NoImplicit : bool Prevent adding hydrogens to the molecule """ @@ -368,27 +392,20 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): "on atoms coordinates") ag.guess_bonds() - terminal_atom_indices = [] for bond in ag.bonds: try: bond_indices = [atom_mapper[i] for i in bond.indices] except KeyError: - # one of the atoms of the bond is not part of the atomgroup. - # can happen for terminal atoms. - # save the bond atom that is in the atomgroup for later - terminal_atom_indices.extend([atom_mapper[i] - for i in bond.indices - if i in atom_mapper.keys()]) - # skip adding this bond continue bond_type = RDBONDORDER.get(bond.order, Chem.BondType.SINGLE) mol.AddBond(*bond_indices, bond_type) mol.UpdatePropertyCache(strict=False) - # infer bond orders and formal charges from the connectivity - _infer_bo_and_charges(mol, terminal_atom_indices) - mol = _standardize_patterns(mol) + if NoImplicit: + # infer bond orders and formal charges from the connectivity + _infer_bo_and_charges(mol) + mol = _standardize_patterns(mol) # sanitize Chem.SanitizeMol(mol) @@ -402,12 +419,11 @@ def _add_mda_attr_to_rdkit(attr, value, mi): Parameters ---------- - attr : str Name of the atom attribute in MDAnalysis in the singular form value : object, np.int or np.float Attribute value as found in the AtomGroup - mi : rdkit.Chem.rdchem.AtomPDBResidueInfo + mi : :class:`rdkit.Chem.rdchem.AtomPDBResidueInfo` MonomerInfo object that will store the relevant atom attributes """ if isinstance(value, np.generic): @@ -437,7 +453,7 @@ def _set_atom_property(atom, attr, value): atom.SetProp(attr, value) -def _infer_bo_and_charges(mol, terminal_atom_indices=[]): +def _infer_bo_and_charges(mol): """Infer bond orders and formal charges from a molecule. Since most MD topology files don't explicitly retain information on bond @@ -447,20 +463,19 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): If an atom has a negative NUE, it needs a positive formal charge (-NUE). If two neighbouring atoms have UEs, the bond between them most likely has to be increased by the value of the smallest NUE. - If after this process, an atom still has UEs, it's either a radical - (because one of its bonds was cut when creating the AtomGroup) or it needs - a negative formal charge of -NUE. Since these radical atoms can be detected - when looping over the bonds of the AtomGroup, only atoms that are not part - of this "terminal_atoms" list will be assigned a negative formal charge. + If after this process, an atom still has UEs, it needs a negative formal + charge of -NUE. Parameters ---------- - - mol : rdkit.Chem.rdchem.RWMol + mol : :class:`rdkit.Chem.rdchem.RWMol` The molecule is modified inplace and must have all hydrogens added - terminal_atom_indices : list - List of terminal atoms indices, i.e. atoms at the edges of a molecule + Notes + ----- + This algorithm is order dependant. For example, for a carboxylate group + R-C(-O)-O the first oxygen read will receive a double bond and the other + one will be charged. It will also affect more complex conjugated systems. """ for atom in mol.GetAtoms(): @@ -508,18 +523,16 @@ def _infer_bo_and_charges(mol, terminal_atom_indices=[]): current_v = atom.GetTotalValence() - atom.GetFormalCharge() nue = [v - current_v for v in expected_vs][0] if nue > 0: - # keep the radical if it's a terminal atom - # else transform it to a negative charge - if atom.GetIdx() not in terminal_atom_indices: - atom.SetFormalCharge(-nue) - atom.SetNumRadicalElectrons(0) - mol.UpdatePropertyCache(strict=False) + # transform it to a negative charge + atom.SetFormalCharge(-nue) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) def _standardize_patterns(mol): """Standardizes functional groups - Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, + Uses :func:`~_rebuild_conjugated_bonds` to standardize conjugated systems, and SMARTS reactions for other functional groups. Due to the way reactions work, we first have to split the molecule by fragments. Then, for each fragment, we apply the standardization reactions. @@ -566,15 +579,14 @@ def _run_reaction(reaction, reactant): Parameters ---------- - reaction : str SMARTS reaction - reactant : rdkit.Chem.rdchem.RWMol + reactant : :class:`rdkit.Chem.rdchem.RWMol` The molecule to transform Returns ------- - Final product of the reaction, as an rdkit.Chem.rdchem.RWMol + Final product of the reaction, as an :class:`rdkit.Chem.rdchem.RWMol` """ # count how many times the reaction should be run pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) @@ -624,8 +636,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): Parameters ---------- - - mol : rdkit.Chem.rdchem.RWMol + mol : :class:`rdkit.Chem.rdchem.RWMol` The molecule to transform max_iter : int Maximum number of iterations performed by the function diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7471a886933..d4041de4c54 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -334,41 +334,38 @@ def test_cache(self): mol = u.atoms.convert_to("RDKIT") assert len(cache) == 1 assert cache != previous_cache - # cache should depend on passed arguments - previous_cache = copy.deepcopy(cache) - mol = u.atoms.convert_to("RDKIT", NoImplicit=False) - assert cache != previous_cache - # skip cache - mol = u.atoms.convert_to("RDKIT", cache=False) - assert cache == {} + # TODO: uncomment once the converters API accepts arguments + # # cache should depend on passed arguments + # previous_cache = copy.deepcopy(cache) + # mol = u.atoms.convert_to("RDKIT", NoImplicit=False) + # assert cache != previous_cache + # # skip cache + # mol = u.atoms.convert_to("RDKIT", cache=False) + # assert cache == {} @requires_rdkit class TestRDKitFunctions(object): - @pytest.mark.parametrize("smi, edges, out", [ - ("C(-[H])(-[H])(-[H])-[H]", [], "C"), - ("[C](-[H])(-[H])-[C](-[H])-[H]", [], "C=C"), - ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", [], + @pytest.mark.parametrize("smi, out", [ + ("C(-[H])(-[H])(-[H])-[H]", "C"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", "c1ccccc1"), - ("C-[C](-[H])-[O]", [], "C(=O)C"), - ("[H]-[C](-[O])-[N](-[H])-[H]", [], "C(=O)N"), - ("[N]-[C]-[H]", [], "N#C"), - ("C-[C](-[O]-[H])-[O]", [], "CC(=O)O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", [], "P(O)(O)(O)=O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", [], "P([O-])(O)(O)=O"), - ("[P](-[O]-[H])(-[O])(-[O])-[O]", [], "P([O-])([O-])(O)=O"), - ("[P](-[O])(-[O])(-[O])-[O]", [], "P([O-])([O-])([O-])=O"), - ("[H]-[O]-[N]-[O]", [], "ON=O"), - ("[N]-[C]-[O]", [], "N#C[O-]"), - ("[C](-[H])(-[H])-[Cl]", [0], "[H][C]([H])Cl"), - ("[C](-[H])-[C](-[H])-[H]", [0], "[H][C]=C([H])[H]"), - ("[C](-[H])-[Cl]", [0], "[H][C]Cl"), - ("[C](-[O])-[Cl]", [0], "O=[C]Cl"), + ("C-[C](-[H])-[O]", "C(=O)C"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[N]-[C]-[H]", "N#C"), + ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), + ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), + ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), + ("[H]-[O]-[N]-[O]", "ON=O"), + ("[N]-[C]-[O]", "N#C[O-]"), ]) - def test_infer_bond_orders(self, smi, edges, out): + def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol, edges) + _infer_bo_and_charges(mol) Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) @@ -398,6 +395,8 @@ def test_infer_charges(self, smi, atom_idx, charge): ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), + ("[C-](=O)-C", "[C](=O)-C"), + ("[H]-[N-]-C", "[H]-[N]-C"), ]) def test_standardize_patterns(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) From cdb60b277d45c666120ca2001ed8e2220b58505b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 Aug 2020 20:01:36 +0200 Subject: [PATCH 77/90] improve doc --- package/MDAnalysis/coordinates/RDKit.py | 107 +++++++++++++++--------- package/doc/sphinx/source/conf.py | 1 + 2 files changed, 68 insertions(+), 40 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 1411e27bf88..580ae6687d6 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -24,8 +24,8 @@ """RDKit molecule I/O --- :mod:`MDAnalysis.coordinates.RDKit` ================================================================ -Read coordinates data from an `RDKit`_ :class:`rdkit.Chem.rdchem.Mol` with -:class:`RDKitReader` into an MDAnalysis Universe. Convert it back to an +Read coordinates data from an `RDKit `_ :class:`rdkit.Chem.rdchem.Mol` with +:class:`RDKitReader` into an MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. @@ -41,7 +41,7 @@ >>> u.trajectory >>> u.atoms.convert_to("RDKIT") - + Classes @@ -53,11 +53,11 @@ .. autoclass:: RDKitConverter :members: - .. automethod:: RDKitConverter._infer_bo_and_charges - .. automethod:: RDKitConverter._standardize_patterns - .. automethod:: RDKitConverter._rebuild_conjugated_bonds +.. autofunction:: _infer_bo_and_charges -.. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol +.. autofunction:: _standardize_patterns + +.. autofunction:: _rebuild_conjugated_bonds """ @@ -147,17 +147,18 @@ def __init__(self, filename, **kwargs): class RDKitConverter(base.ConverterBase): """Convert MDAnalysis :class:`~MDAnalysis.core.groups.AtomGroup` or - :class:`~MDAnalysis.core.universe.Universe` to `RDKit`_ - :class:`rdkit.Chem.rdchem.Mol` + :class:`~MDAnalysis.core.universe.Universe` to RDKit + :class:`~rdkit.Chem.rdchem.Mol` - MDanalysis attributes are stored in each RDKit atom of the resulting - molecule in two different ways: + MDanalysis attributes are stored in each RDKit + :class:`~rdkit.Chem.rdchem.Atom` of the resulting molecule in two different + ways: - * in an `AtomPDBResidueInfo` object available through the - ``atom.GetMonomerInfo()`` method if it's an attribute that is typically - found in a PDB file, + * in an :class:`~rdkit.Chem.rdchem.AtomPDBResidueInfo` object available + through the :meth:`~rdkit.Chem.rdchem.Atom.GetMonomerInfo` method if it's + an attribute that is typically found in a PDB file, * directly as an atom property available through the - ``atom.GetPropsAsDict()`` method for the others. + :meth:`~rdkit.Chem.rdchem.Atom.GetProp` methods for the others. Supported attributes: @@ -196,12 +197,28 @@ class RDKitConverter(base.ConverterBase): Example ------- - .. code-block:: python + To access MDAnalysis properties:: + + >>> import MDAnalysis as mda + >>> from MDAnalysis.tests.datafiles import PDB_full + >>> u = mda.Universe(PDB_full) + >>> mol = u.select_atoms('resname DMS').convert_to('RDKIT') + >>> mol.GetAtomWithIdx(0).GetMonomerInfo().GetResidueName() + 'DMS' + + To create a molecule for each frame of a trajectory:: - import MDAnalysis as mda - from MDAnalysis.tests.datafiles import PDB_full - u = mda.Universe(PDB_full) - mol = u.select_atoms('resname DMS').convert_to('RDKIT') + from MDAnalysisTests.datafiles import PSF, DCD + from rdkit.Chem.Descriptors3D import Asphericity + + u = mda.Universe(PSF, DCD) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr('elements', elements) + ag = u.select_atoms("resid 1-10") + + for ts in u.trajectory: + mol = ag.convert_to("RDKIT") + x = Asphericity(mol) Notes @@ -215,7 +232,7 @@ class RDKitConverter(base.ConverterBase): If both ``tempfactors`` and ``bfactors`` attributes are present, the conversion will fail, since only one of these should be present. - TODO: Wait for Issue #1901 for a solution + Refer to Issue #1901 for a solution Hydrogens should be explicit in the topology file. If this is not the case, use the parameter ``NoImplicit=False`` when using the converter to allow @@ -231,17 +248,16 @@ class RDKitConverter(base.ConverterBase): is finished. Instead, users should do this in two steps by first saving the selection in a variable and then converting the saved AtomGroup. It also means that ``ag.convert_to("RDKIT")`` followed by - ``ag.convert_to("RDKIT", NoImplicit=True)`` will not use the cache. + ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the cache. Finally if you're modifying the AtomGroup in place between two conversions, the id of the AtomGroup won't change and thus the converter will use the cached molecule. For this reason, you can pass a ``cache=False`` argument to the converter to bypass the caching system. - The cached molecule doesn't contain the coordinates of the atoms. + Note that the cached molecule doesn't contain the coordinates of the atoms. .. versionadded:: 2.0.0 - .. _RDKit: https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol """ lib = 'RDKIT' @@ -250,7 +266,7 @@ class RDKitConverter(base.ConverterBase): def convert(self, obj, cache=True, NoImplicit=True): """Write selection at current trajectory frame to - :class:`rdkit.Chem.rdchem.Mol`. + :class:`~rdkit.Chem.rdchem.Mol`. Parameters ----------- @@ -415,7 +431,7 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): def _add_mda_attr_to_rdkit(attr, value, mi): """Converts an MDAnalysis atom attribute into the RDKit equivalent and - stores it into an RDKit AtomPDBResidueInfo object. + stores it into an RDKit :class:`~rdkit.Chem.rdchem.AtomPDBResidueInfo`. Parameters ---------- @@ -423,7 +439,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): Name of the atom attribute in MDAnalysis in the singular form value : object, np.int or np.float Attribute value as found in the AtomGroup - mi : :class:`rdkit.Chem.rdchem.AtomPDBResidueInfo` + mi : rdkit.Chem.rdchem.AtomPDBResidueInfo MonomerInfo object that will store the relevant atom attributes """ if isinstance(value, np.generic): @@ -468,7 +484,7 @@ def _infer_bo_and_charges(mol): Parameters ---------- - mol : :class:`rdkit.Chem.rdchem.RWMol` + mol : rdkit.Chem.rdchem.RWMol The molecule is modified inplace and must have all hydrogens added Notes @@ -532,11 +548,21 @@ def _infer_bo_and_charges(mol): def _standardize_patterns(mol): """Standardizes functional groups - Uses :func:`~_rebuild_conjugated_bonds` to standardize conjugated systems, + Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, and SMARTS reactions for other functional groups. Due to the way reactions work, we first have to split the molecule by fragments. Then, for each fragment, we apply the standardization reactions. Finally, the fragments are recombined. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule """ # standardize conjugated systems @@ -581,12 +607,13 @@ def _run_reaction(reaction, reactant): ---------- reaction : str SMARTS reaction - reactant : :class:`rdkit.Chem.rdchem.RWMol` + reactant : rdkit.Chem.rdchem.Mol The molecule to transform Returns ------- - Final product of the reaction, as an :class:`rdkit.Chem.rdchem.RWMol` + product : rdkit.Chem.rdchem.Mol + The final product of the reaction """ # count how many times the reaction should be run pattern = Chem.MolFromSmarts(reaction.split(">>")[0]) @@ -622,13 +649,13 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): usual alternating single and double bonds. This function corrects this behaviour by using an iterative procedure. The problematic molecules always follow the same pattern: - `anion(-*=*)n-anion` instead of `*=(*-*=)n*`, where `n` is the number of - successive single and double bonds. The goal of the iterative procedure is - to make `n` as small as possible by consecutively transforming - `anion-*=*` into `*=*-anion` until it reaches the smallest pattern with - `n=1`. This last pattern is then transformed from `anion-*=*-anion` to - `*=*-*=*`. - Since `anion-*=*` is the same as `*=*-anion` in terms of SMARTS, we can + ``anion(-*=*)n-anion`` instead of ``*=(*-*=)n*``, where ``n`` is the number + of successive single and double bonds. The goal of the iterative procedure + is to make ``n`` as small as possible by consecutively transforming + ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with + ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to + ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can control that we don't transform the same triplet of atoms back and forth by adding their indices to a list. The molecule needs to be kekulized first to also cover systems @@ -636,8 +663,8 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): Parameters ---------- - mol : :class:`rdkit.Chem.rdchem.RWMol` - The molecule to transform + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform, modified inplace max_iter : int Maximum number of iterations performed by the function """ diff --git a/package/doc/sphinx/source/conf.py b/package/doc/sphinx/source/conf.py index 296b479d6f9..a149c769a8f 100644 --- a/package/doc/sphinx/source/conf.py +++ b/package/doc/sphinx/source/conf.py @@ -346,4 +346,5 @@ 'https://gsd.readthedocs.io/en/stable/': None, 'https://parmed.github.io/ParmEd/html/': None, 'https://docs.h5py.org/en/stable': None, + 'https://www.rdkit.org/docs/': None, } From ebd04168295f84fce082d93b91d3762cde33f555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 Aug 2020 20:02:14 +0200 Subject: [PATCH 78/90] add test nan in coordinates --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index d4041de4c54..7b7e9521a25 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -314,6 +314,15 @@ def test_trajectory_coords(self): indices = [a.GetIdx() for a in indices] assert_equal(positions[indices], ts.positions) + def test_nan_coords(self): + u = mda.Universe.from_smiles("CCO") + xyz = u.atoms.positions + xyz[0][2] = np.nan + u.atoms.positions = xyz + mol = u.atoms.convert_to("RDKIT") + with pytest.raises(ValueError, match="Bad Conformer Id"): + mol.GetConformer() + def test_cache(self): u = mda.Universe.from_smiles("CCO", numConfs=5) ag = u.atoms From a1aab5b3212515007c61e61438102773ce74557c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 Aug 2020 21:09:02 +0200 Subject: [PATCH 79/90] warn nan coordinates --- package/MDAnalysis/coordinates/RDKit.py | 26 +++++++++++-------- .../MDAnalysisTests/coordinates/test_rdkit.py | 3 ++- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 580ae6687d6..fe0b61f8519 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -315,17 +315,21 @@ def convert(self, obj, cache=True, NoImplicit=True): mol = self.atomgroup_to_mol(ag, **kwargs) # add a conformer for the current Timestep - if hasattr(ag, "positions") and not np.isnan(ag.positions).any(): - # assign coordinates - conf = Chem.Conformer(mol.GetNumAtoms()) - for atom in mol.GetAtoms(): - idx = atom.GetIntProp("_MDAnalysis_index") - xyz = ag.positions[idx].astype(float) - conf.SetAtomPosition(atom.GetIdx(), xyz) - mol.AddConformer(conf) - # assign R/S to atoms and Z/E to bonds - Chem.AssignStereochemistryFrom3D(mol) - Chem.SetDoubleBondNeighborDirections(mol) + if hasattr(ag, "positions"): + if np.isnan(ag.positions).any(): + warnings.warn("NaN detected in coordinates, the output " + "molecule will not have 3D coordinates assigned") + else: + # assign coordinates + conf = Chem.Conformer(mol.GetNumAtoms()) + for atom in mol.GetAtoms(): + idx = atom.GetIntProp("_MDAnalysis_index") + xyz = ag.positions[idx].astype(float) + conf.SetAtomPosition(atom.GetIdx(), xyz) + mol.AddConformer(conf) + # assign R/S to atoms and Z/E to bonds + Chem.AssignStereochemistryFrom3D(mol) + Chem.SetDoubleBondNeighborDirections(mol) return mol diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 7b7e9521a25..9ccb53dd7d3 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -319,7 +319,8 @@ def test_nan_coords(self): xyz = u.atoms.positions xyz[0][2] = np.nan u.atoms.positions = xyz - mol = u.atoms.convert_to("RDKIT") + with pytest.warns(UserWarning, match="NaN detected"): + mol = u.atoms.convert_to("RDKIT") with pytest.raises(ValueError, match="Bad Conformer Id"): mol.GetConformer() From dcb8f33b11378829997cbaf6589b84b48e3cae7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 13 Aug 2020 22:20:48 +0200 Subject: [PATCH 80/90] end of code review --- package/MDAnalysis/coordinates/RDKit.py | 61 +++++++++++++++---- .../MDAnalysisTests/coordinates/test_rdkit.py | 18 +++--- 2 files changed, 58 insertions(+), 21 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index fe0b61f8519..dd90e88868c 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -387,7 +387,7 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): for i, (atom, element) in enumerate(zip(ag, elements)): # create atom rdatom = Chem.Atom(element.capitalize()) - # disable adding H to the molecule + # enable/disable adding implicit H to the molecule rdatom.SetNoImplicit(NoImplicit) # add PDB-like properties mi = Chem.AtomPDBResidueInfo() @@ -478,7 +478,7 @@ def _infer_bo_and_charges(mol): Since most MD topology files don't explicitly retain information on bond orders or charges, it has to be guessed from the topology. This is done by - looping other each atom and comparing its expected valence to the current + looping over each atom and comparing its expected valence to the current valence to get the Number of Unpaired Electrons (NUE). If an atom has a negative NUE, it needs a positive formal charge (-NUE). If two neighbouring atoms have UEs, the bond between them most @@ -500,9 +500,7 @@ def _infer_bo_and_charges(mol): for atom in mol.GetAtoms(): # get NUE for each possible valence - expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) - current_v = atom.GetTotalValence() - atom.GetFormalCharge() - nue = [v - current_v for v in expected_vs] + nue = _get_nb_unpaired_electrons(atom) # if there's only one possible valence state and the corresponding # NUE is negative, it means we can only add a positive charge to # the atom @@ -517,10 +515,7 @@ def _infer_bo_and_charges(mol): # check if one of the neighbors has a common NUE for i, na in enumerate(neighbors, start=1): # get NUE for the neighbor - na_expected_vs = PERIODIC_TABLE.GetValenceList( - na.GetAtomicNum()) - na_current_v = na.GetTotalValence() - na.GetFormalCharge() - na_nue = [v - na_current_v for v in na_expected_vs] + na_nue = _get_nb_unpaired_electrons(na) # smallest common NUE common_nue = min( min([i for i in nue if i >= 0], default=0), @@ -536,12 +531,10 @@ def _infer_bo_and_charges(mol): mol.UpdatePropertyCache(strict=False) if i < len(neighbors): # recalculate nue for atom - current_v = atom.GetTotalValence() - nue = [v - current_v for v in expected_vs] + nue = _get_nb_unpaired_electrons(atom) # if the atom still has unpaired electrons - current_v = atom.GetTotalValence() - atom.GetFormalCharge() - nue = [v - current_v for v in expected_vs][0] + nue = _get_nb_unpaired_electrons(atom)[0] if nue > 0: # transform it to a negative charge atom.SetFormalCharge(-nue) @@ -549,6 +542,24 @@ def _infer_bo_and_charges(mol): mol.UpdatePropertyCache(strict=False) +def _get_nb_unpaired_electrons(atom): + """Calculate the number of unpaired electrons (NUE) of an atom + + Parameters + ---------- + atom: rdkit.Chem.rdchem.Atom + The atom for which the NUE will be computed + + Returns + ------- + nue : list + The NUE for each possible valence of the atom + """ + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() - atom.GetFormalCharge() + return [v - current_v for v in expected_vs] + + def _standardize_patterns(mol): """Standardizes functional groups @@ -558,6 +569,30 @@ def _standardize_patterns(mol): fragments. Then, for each fragment, we apply the standardization reactions. Finally, the fragments are recombined. + Notes + ----- + The following functional groups are transformed: + + +--------------+----------------------------------------------------------------------------+ + | Name | Reaction | + +==============+============================================================================+ + | Cterm | [C-;X2:1]=[O:2]>>[C;+0:1]=[O:2] | + +--------------+----------------------------------------------------------------------------+ + | Nterm | [N-;X2;H1:1]>>[N;+0:1] | + +--------------+----------------------------------------------------------------------------+ + | keto-enolate | [C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3] | + +--------------+----------------------------------------------------------------------------+ + | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C;+0:2](-[N:3])=[N;+1:4] | + +--------------+----------------------------------------------------------------------------+ + | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O;+0:2])=[O;+0:3] | + +--------------+----------------------------------------------------------------------------+ + | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N;+1:1](-[O;-1:2])=[O;+0:3] | + +--------------+----------------------------------------------------------------------------+ + | conjugated | [*-:1]-[*:2]=[*:3]-[*-:4]>>[*;+0:1]=[*:2]-[*:3]=[*;+0:4] | + +--------------+----------------------------------------------------------------------------+ + | conjugated-N | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N;+1:1]=[*:2]-[*:3]=[*;+0:4] | + +--------------+----------------------------------------------------------------------------+ + Parameters ---------- mol : rdkit.Chem.rdchem.RWMol diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 9ccb53dd7d3..afee8bf5e1b 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -173,6 +173,7 @@ def test_single_atom_mol(self, smi): generate_coordinates=False) mol = u.atoms.convert_to("RDKIT") assert mol.GetNumAtoms() == 1 + assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ ("PRO", 14, 1), @@ -344,14 +345,15 @@ def test_cache(self): mol = u.atoms.convert_to("RDKIT") assert len(cache) == 1 assert cache != previous_cache - # TODO: uncomment once the converters API accepts arguments - # # cache should depend on passed arguments - # previous_cache = copy.deepcopy(cache) - # mol = u.atoms.convert_to("RDKIT", NoImplicit=False) - # assert cache != previous_cache - # # skip cache - # mol = u.atoms.convert_to("RDKIT", cache=False) - # assert cache == {} + # converter with kwargs + rdkit_converter = mda.coordinates.RDKit.RDKitConverter().convert + # cache should depend on passed arguments + previous_cache = copy.deepcopy(cache) + mol = rdkit_converter(u.atoms, NoImplicit=False) + assert cache != previous_cache + # skip cache + mol = rdkit_converter(u.atoms, cache=False) + assert cache == {} @requires_rdkit From 47b27551cb5a050cac9dc54001c88f7e882b724e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 02:23:52 +0200 Subject: [PATCH 81/90] corrections to smarts patterns for reactions --- package/MDAnalysis/coordinates/RDKit.py | 54 ++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index dd90e88868c..8a96ce0895b 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -573,25 +573,25 @@ def _standardize_patterns(mol): ----- The following functional groups are transformed: - +--------------+----------------------------------------------------------------------------+ - | Name | Reaction | - +==============+============================================================================+ - | Cterm | [C-;X2:1]=[O:2]>>[C;+0:1]=[O:2] | - +--------------+----------------------------------------------------------------------------+ - | Nterm | [N-;X2;H1:1]>>[N;+0:1] | - +--------------+----------------------------------------------------------------------------+ - | keto-enolate | [C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3] | - +--------------+----------------------------------------------------------------------------+ - | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C;+0:2](-[N:3])=[N;+1:4] | - +--------------+----------------------------------------------------------------------------+ - | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O;+0:2])=[O;+0:3] | - +--------------+----------------------------------------------------------------------------+ - | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N;+1:1](-[O;-1:2])=[O;+0:3] | - +--------------+----------------------------------------------------------------------------+ - | conjugated | [*-:1]-[*:2]=[*:3]-[*-:4]>>[*;+0:1]=[*:2]-[*:3]=[*;+0:4] | - +--------------+----------------------------------------------------------------------------+ - | conjugated-N | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N;+1:1]=[*:2]-[*:3]=[*;+0:4] | - +--------------+----------------------------------------------------------------------------+ + +--------------+-------------------------------------------------------------------------+ + | Name | Reaction | + +==============+=========================================================================+ + | Cterm | [C-;X2:1]=[O:2]>>[C+0:1]=[O:2] | + +--------------+-------------------------------------------------------------------------+ + | Nterm | [N-;X2;H1:1]>>[N+0:1] | + +--------------+-------------------------------------------------------------------------+ + | keto-enolate | [#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3] | + +--------------+-------------------------------------------------------------------------+ + | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4] | + +--------------+-------------------------------------------------------------------------+ + | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3] | + +--------------+-------------------------------------------------------------------------+ + | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3] | + +--------------+-------------------------------------------------------------------------+ + | conjugated | [*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | + +--------------+-------------------------------------------------------------------------+ + | conjugated-N | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | + +--------------+-------------------------------------------------------------------------+ Parameters ---------- @@ -611,15 +611,15 @@ def _standardize_patterns(mol): for reactant in Chem.GetMolFrags(mol, asMols=True): for name, reaction in [ - ("Cterm", "[C-;X2:1]=[O:2]>>[C;+0:1]=[O:2]"), - ("Nterm", "[N-;X2;H1:1]>>[N;+0:1]"), - ("keto-enolate", "[C-:1]-[C:2]=[O:3]>>[C;+0:1]=[C:2]-[O;-1:3]"), + ("Cterm", "[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]"), + ("Nterm", "[N-;X2;H1:1]>>[N+0:1]"), + ("keto-enolate", "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]"), ("ARG", "[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]" - ">>[N:1]-[C;+0:2](-[N:3])=[N;+1:4]"), + ">>[N:1]-[C+0:2](-[N:3])=[N+:4]"), ("sulfone", "[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]" - ">>[S:1](=[O;+0:2])=[O;+0:3]"), + ">>[S:1](=[O+0:2])=[O+0:3]"), ("nitro", "[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]" - ">>[N;+1:1](-[O;-1:2])=[O;+0:3]"), + ">>[N+:1](-[O-:2])=[O+0:3]"), ]: reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) @@ -688,7 +688,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): usual alternating single and double bonds. This function corrects this behaviour by using an iterative procedure. The problematic molecules always follow the same pattern: - ``anion(-*=*)n-anion`` instead of ``*=(*-*=)n*``, where ``n`` is the number + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number of successive single and double bonds. The goal of the iterative procedure is to make ``n`` as small as possible by consecutively transforming ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with @@ -709,7 +709,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) - pattern = Chem.MolFromSmarts("[*-]-[*;+0]=[*;+0;!O]") + pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) From 89efcd3891755126eb1cb35852ea83b95cf78a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 02:27:40 +0200 Subject: [PATCH 82/90] sort atoms by NUE + more standardization tests --- package/MDAnalysis/coordinates/RDKit.py | 6 ++++-- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 8 +++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 8a96ce0895b..a5780c11d32 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -498,7 +498,8 @@ def _infer_bo_and_charges(mol): one will be charged. It will also affect more complex conjugated systems. """ - for atom in mol.GetAtoms(): + for atom in sorted(mol.GetAtoms(), reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]): # get NUE for each possible valence nue = _get_nb_unpaired_electrons(atom) # if there's only one possible valence state and the corresponding @@ -511,7 +512,8 @@ def _infer_bo_and_charges(mol): if (len(nue) == 1) and (nue[0] <= 0): continue else: - neighbors = atom.GetNeighbors() + neighbors = sorted(atom.GetNeighbors(), reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]) # check if one of the neighbors has a common NUE for i, na in enumerate(neighbors, start=1): # get NUE for the neighbor diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index afee8bf5e1b..4afb53d2a9b 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -409,6 +409,12 @@ def test_infer_charges(self, smi, atom_idx, charge): ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), ("[C-](=O)-C", "[C](=O)-C"), ("[H]-[N-]-C", "[H]-[N]-C"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", + "[O-]c1ccccc1"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", + "[O-]C1=CC=CC1[O-]"), + ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), + ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), ]) def test_standardize_patterns(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) @@ -474,7 +480,6 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "C=CC=CC=CC=CC=CC=C", "NCCCCC([NH3+])C(=O)[O-]", "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", - ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders @@ -482,6 +487,7 @@ def test_order_independant(self, smi_in): template = Chem.AddHs(ref) for atom in template.GetAtoms(): atom.SetIsAromatic(False) + atom.SetFormalCharge(0) for bond in template.GetBonds(): bond.SetIsAromatic(False) bond.SetBondType(Chem.BondType.SINGLE) From a595e69f3e65dd80cccaff2e5d8fd22429d7df78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 02:28:14 +0200 Subject: [PATCH 83/90] pdb atom names formating --- package/MDAnalysis/coordinates/RDKit.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index a5780c11d32..dbe0fc96d18 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -455,9 +455,14 @@ def _add_mda_attr_to_rdkit(attr, value, mi): name = re.findall(r'(\D+|\d+)', value) if len(name) == 2: symbol, number = name + if len(number) > 2 and len(symbol) == 1: + value = "{}{}".format(symbol, number) + else: + value = "{:>2.2}{:<2.2}".format(symbol, number) else: - symbol, number = name[0], "" - value = "{:>2.2}{:<2.2}".format(symbol, number) + # no number in the name + value = " {:<}".format(name[0]) + # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] getattr(mi, "Set%s" % rdattr)(value) From f394e7226c84bb7508a6edfef4a10ed3bd5b9366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 02:29:29 +0200 Subject: [PATCH 84/90] test RDKit->MDA->RDKit->MDA equal --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index 4afb53d2a9b..f19d648c9ce 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -212,6 +212,11 @@ def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") assert rdmol.HasSubstructMatch(umol) and umol.HasSubstructMatch(rdmol) + u2 = mda.Universe(umol) + assert_equal(u.atoms.bonds, u2.atoms.bonds) + assert_equal(u.atoms.elements, u2.atoms.elements) + assert_equal(u.atoms.names, u2.atoms.names) + assert_almost_equal(u.atoms.positions, u2.atoms.positions, decimal=7) def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) @@ -295,7 +300,7 @@ def test_assign_coordinates(self, pdb): indices = sorted(mol.GetAtoms(), key=lambda a: a.GetIntProp("_MDAnalysis_index")) indices = [a.GetIdx() for a in indices] - assert_equal(positions[indices], pdb.atoms.positions) + assert_almost_equal(positions[indices], pdb.atoms.positions) def test_assign_stereochemistry(self, mol2): umol = mol2.atoms.convert_to("RDKIT") @@ -313,7 +318,7 @@ def test_trajectory_coords(self): indices = sorted(mol.GetAtoms(), key=lambda a: a.GetIntProp("_MDAnalysis_index")) indices = [a.GetIdx() for a in indices] - assert_equal(positions[indices], ts.positions) + assert_almost_equal(positions[indices], ts.positions) def test_nan_coords(self): u = mda.Universe.from_smiles("CCO") From e525e41517df3ebfa824a6c412065a3e75bc0a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 15:21:54 +0200 Subject: [PATCH 85/90] tackle conjugated systems ending with O- --- package/MDAnalysis/coordinates/RDKit.py | 34 +++++++++++++++++-------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index dbe0fc96d18..5c991f9486e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -716,7 +716,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): """ mol.UpdatePropertyCache(strict=False) Chem.Kekulize(mol) - pattern = Chem.MolFromSmarts("[*-]-[*+0]=[*+0]") + pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]") # number of unique matches with the pattern n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) @@ -725,13 +725,13 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): return # check if there's an even number of anion-*=* patterns elif n_matches % 2 == 0: - end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[*-]") - end_charge = 0 + end_pattern = Chem.MolFromSmarts("[*-;!O]-[*+0]=[*+0]-[*-]") else: - # the only way to standardize is to find a nitrogen that can accept - # a double bond and a positive charge - end_pattern = Chem.MolFromSmarts("[*-]-[*]=[*]-[N;X3;v3]") - end_charge = 1 + # as a last resort, the only way to standardize is to find a nitrogen + # that can accept a double bond and a positive charge + # or a carbonyl that will become an enolate + end_pattern = Chem.MolFromSmarts( + "[*-;!O]-[*+0]=[*+0]-[$([#7;X3;v3]),$([#6+0]=O)]") backtrack = [] for _ in range(max_iter): # simplest case where n=1 @@ -739,10 +739,24 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): if end_match: # index of each atom anion1, a1, a2, anion2 = end_match - # charges + term_atom = mol.GetAtomWithIdx(anion2) + # [*-]-*=*-C=O + if term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0: + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if neighbor.GetAtomicNum() == 8 and bond.GetBondTypeAsDouble() == 2: + bond.SetBondType(Chem.BondType.SINGLE) + neighbor.SetFormalCharge(-1) + else: + # [*-]-*=*-N + if term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 0: + end_charge = 1 + # [*-]-*=*-[*-] + else: + end_charge = 0 + mol.GetAtomWithIdx(anion2).SetFormalCharge(end_charge) + # common part of the conjugated systems: [*-]-*=* mol.GetAtomWithIdx(anion1).SetFormalCharge(0) - mol.GetAtomWithIdx(anion2).SetFormalCharge(end_charge) - # bonds mol.GetBondBetweenAtoms(anion1, a1).SetBondType( Chem.BondType.DOUBLE) mol.GetBondBetweenAtoms(a1, a2).SetBondType(Chem.BondType.SINGLE) From 9e22681dbca9873b6f39788787e84366cbad4761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 15:27:55 +0200 Subject: [PATCH 86/90] docs + pep8 --- package/MDAnalysis/coordinates/RDKit.py | 46 +++++++++++++------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 5c991f9486e..7e1dc47074e 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -150,7 +150,7 @@ class RDKitConverter(base.ConverterBase): :class:`~MDAnalysis.core.universe.Universe` to RDKit :class:`~rdkit.Chem.rdchem.Mol` - MDanalysis attributes are stored in each RDKit + MDanalysis attributes are stored in each RDKit :class:`~rdkit.Chem.rdchem.Atom` of the resulting molecule in two different ways: @@ -462,7 +462,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi): else: # no number in the name value = " {:<}".format(name[0]) - + # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] getattr(mi, "Set%s" % rdattr)(value) @@ -580,25 +580,27 @@ def _standardize_patterns(mol): ----- The following functional groups are transformed: - +--------------+-------------------------------------------------------------------------+ - | Name | Reaction | - +==============+=========================================================================+ - | Cterm | [C-;X2:1]=[O:2]>>[C+0:1]=[O:2] | - +--------------+-------------------------------------------------------------------------+ - | Nterm | [N-;X2;H1:1]>>[N+0:1] | - +--------------+-------------------------------------------------------------------------+ - | keto-enolate | [#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3] | - +--------------+-------------------------------------------------------------------------+ - | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4] | - +--------------+-------------------------------------------------------------------------+ - | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3] | - +--------------+-------------------------------------------------------------------------+ - | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3] | - +--------------+-------------------------------------------------------------------------+ - | conjugated | [*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | - +--------------+-------------------------------------------------------------------------+ - | conjugated-N | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | - +--------------+-------------------------------------------------------------------------+ + +---------------+-------------------------------------------------------------------------+ + | Name | Reaction | + +===============+=========================================================================+ + | Cterm | [C-;X2:1]=[O:2]>>[C+0:1]=[O:2] | + +---------------+-------------------------------------------------------------------------+ + | Nterm | [N-;X2;H1:1]>>[N+0:1] | + +---------------+-------------------------------------------------------------------------+ + | keto-enolate | [#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3] | + +---------------+-------------------------------------------------------------------------+ + | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4] | + +---------------+-------------------------------------------------------------------------+ + | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3] | + +---------------+-------------------------------------------------------------------------+ + | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3] | + +---------------+-------------------------------------------------------------------------+ + | conjugated | [*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | + +---------------+-------------------------------------------------------------------------+ + | conjugated-N+ | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | + +---------------+-------------------------------------------------------------------------+ + | conjugated-O- | [O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5] | + +---------------+-------------------------------------------------------------------------+ Parameters ---------- @@ -694,7 +696,7 @@ def _rebuild_conjugated_bonds(mol, max_iter=200): a double bond less and both edges of the system as anions instead of the usual alternating single and double bonds. This function corrects this behaviour by using an iterative procedure. - The problematic molecules always follow the same pattern: + The problematic molecules always follow the same pattern: ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number of successive single and double bonds. The goal of the iterative procedure is to make ``n`` as small as possible by consecutively transforming From eaf68876a81f0263717956d96516abb82b996640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 15:28:22 +0200 Subject: [PATCH 87/90] tests --- testsuite/MDAnalysisTests/coordinates/test_rdkit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py index f19d648c9ce..930e65fa730 100644 --- a/testsuite/MDAnalysisTests/coordinates/test_rdkit.py +++ b/testsuite/MDAnalysisTests/coordinates/test_rdkit.py @@ -417,7 +417,7 @@ def test_infer_charges(self, smi, atom_idx, charge): ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", "[O-]c1ccccc1"), ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", - "[O-]C1=CC=CC1[O-]"), + "[O-]C1=CC=CC1=O"), ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), ]) @@ -485,6 +485,7 @@ def test_reassign_props_after_reaction(self, rdmol, product, name): "C=CC=CC=CC=CC=CC=C", "NCCCCC([NH3+])C(=O)[O-]", "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", ]) def test_order_independant(self, smi_in): # generate mol with hydrogens but without bond orders From ded60c016e3b1f60bb9c6bbe4c8f55bdc9a62f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 15:36:38 +0200 Subject: [PATCH 88/90] add max_iter kwarg --- package/MDAnalysis/coordinates/RDKit.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 7e1dc47074e..183bd52f5c5 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -264,7 +264,7 @@ class RDKitConverter(base.ConverterBase): units = {'time': None, 'length': 'Angstrom'} _cache = dict() - def convert(self, obj, cache=True, NoImplicit=True): + def convert(self, obj, cache=True, NoImplicit=True, max_iter=200): """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. @@ -279,9 +279,12 @@ def convert(self, obj, cache=True, NoImplicit=True): to the converter (with the exception of this `cache` argument) NoImplicit : bool Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` """ # parameters passed to atomgroup_to_mol and used by the cache - kwargs = dict(NoImplicit=NoImplicit) + kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter) try: from rdkit import Chem @@ -333,14 +336,18 @@ def convert(self, obj, cache=True, NoImplicit=True): return mol - def atomgroup_to_mol(self, ag, NoImplicit=True): + def atomgroup_to_mol(self, ag, NoImplicit=True, max_iter=200): """Converts an AtomGroup to an RDKit molecule. Parameters ----------- - ag : :class:`~MDAnalysis.core.groups.AtomGroup` + ag : MDAnalysis.core.groups.AtomGroup + The AtomGroup to convert NoImplicit : bool Prevent adding hydrogens to the molecule + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :func:`_rebuild_conjugated_bonds` """ try: elements = ag.elements @@ -425,7 +432,7 @@ def atomgroup_to_mol(self, ag, NoImplicit=True): if NoImplicit: # infer bond orders and formal charges from the connectivity _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol) + mol = _standardize_patterns(mol, max_iter) # sanitize Chem.SanitizeMol(mol) @@ -567,7 +574,7 @@ def _get_nb_unpaired_electrons(atom): return [v - current_v for v in expected_vs] -def _standardize_patterns(mol): +def _standardize_patterns(mol, max_iter=200): """Standardizes functional groups Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, @@ -606,6 +613,8 @@ def _standardize_patterns(mol): ---------- mol : rdkit.Chem.rdchem.RWMol The molecule to standardize + max_iter : int + Maximum number of iterations to standardize conjugated systems Returns ------- @@ -614,7 +623,7 @@ def _standardize_patterns(mol): """ # standardize conjugated systems - _rebuild_conjugated_bonds(mol) + _rebuild_conjugated_bonds(mol, max_iter) fragments = [] for reactant in Chem.GetMolFrags(mol, asMols=True): From 4ed048385511fa6ddcc9be07f60bf9b43f86a6ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 17:20:25 +0200 Subject: [PATCH 89/90] reorder docstring standardize_patterns --- package/MDAnalysis/coordinates/RDKit.py | 37 +++++++++++++------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 183bd52f5c5..1de9b95e329 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -583,13 +583,31 @@ def _standardize_patterns(mol, max_iter=200): fragments. Then, for each fragment, we apply the standardization reactions. Finally, the fragments are recombined. + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + max_iter : int + Maximum number of iterations to standardize conjugated systems + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule + Notes ----- - The following functional groups are transformed: + The following functional groups are transformed in this order: +---------------+-------------------------------------------------------------------------+ | Name | Reaction | +===============+=========================================================================+ + | conjugated | [*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | + +---------------+-------------------------------------------------------------------------+ + | conjugated-N+ | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | + +---------------+-------------------------------------------------------------------------+ + | conjugated-O- | [O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5] | + +---------------+-------------------------------------------------------------------------+ | Cterm | [C-;X2:1]=[O:2]>>[C+0:1]=[O:2] | +---------------+-------------------------------------------------------------------------+ | Nterm | [N-;X2;H1:1]>>[N+0:1] | @@ -602,24 +620,7 @@ def _standardize_patterns(mol, max_iter=200): +---------------+-------------------------------------------------------------------------+ | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3] | +---------------+-------------------------------------------------------------------------+ - | conjugated | [*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | - +---------------+-------------------------------------------------------------------------+ - | conjugated-N+ | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | - +---------------+-------------------------------------------------------------------------+ - | conjugated-O- | [O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5] | - +---------------+-------------------------------------------------------------------------+ - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to standardize - max_iter : int - Maximum number of iterations to standardize conjugated systems - - Returns - ------- - mol : rdkit.Chem.rdchem.Mol - The standardized molecule """ # standardize conjugated systems From 4e49616720feb27bf4609279e5e8d05968be9864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 14 Aug 2020 19:28:44 +0200 Subject: [PATCH 90/90] fix doc build --- package/MDAnalysis/coordinates/RDKit.py | 67 +++++++++++++------------ 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/package/MDAnalysis/coordinates/RDKit.py b/package/MDAnalysis/coordinates/RDKit.py index 1de9b95e329..54184371912 100644 --- a/package/MDAnalysis/coordinates/RDKit.py +++ b/package/MDAnalysis/coordinates/RDKit.py @@ -24,7 +24,7 @@ """RDKit molecule I/O --- :mod:`MDAnalysis.coordinates.RDKit` ================================================================ -Read coordinates data from an `RDKit `_ :class:`rdkit.Chem.rdchem.Mol` with +Read coordinates data from an `RDKit `__ :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` into an MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. @@ -32,16 +32,19 @@ Example ------- ->>> from rdkit import Chem ->>> import MDAnalysis as mda ->>> mol = Chem.MolFromMol2File("docking_poses.mol2", removeHs=False) ->>> u = mda.Universe(mol) ->>> u - ->>> u.trajectory - ->>> u.atoms.convert_to("RDKIT") - +To read an RDKit molecule and then convert the AtomGroup back to an RDKit +molecule:: + + >>> from rdkit import Chem + >>> import MDAnalysis as mda + >>> mol = Chem.MolFromMol2File("docking_poses.mol2", removeHs=False) + >>> u = mda.Universe(mol) + >>> u + + >>> u.trajectory + + >>> u.atoms.convert_to("RDKIT") + Classes @@ -599,27 +602,27 @@ def _standardize_patterns(mol, max_iter=200): ----- The following functional groups are transformed in this order: - +---------------+-------------------------------------------------------------------------+ - | Name | Reaction | - +===============+=========================================================================+ - | conjugated | [*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4] | - +---------------+-------------------------------------------------------------------------+ - | conjugated-N+ | [N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4] | - +---------------+-------------------------------------------------------------------------+ - | conjugated-O- | [O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5] | - +---------------+-------------------------------------------------------------------------+ - | Cterm | [C-;X2:1]=[O:2]>>[C+0:1]=[O:2] | - +---------------+-------------------------------------------------------------------------+ - | Nterm | [N-;X2;H1:1]>>[N+0:1] | - +---------------+-------------------------------------------------------------------------+ - | keto-enolate | [#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3] | - +---------------+-------------------------------------------------------------------------+ - | arginine | [N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4] | - +---------------+-------------------------------------------------------------------------+ - | sulfone | [S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3] | - +---------------+-------------------------------------------------------------------------+ - | nitro | [N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3] | - +---------------+-------------------------------------------------------------------------+ + +---------------+-----------------------------------------------------------------------------+ + | Name | Reaction | + +===============+=============================================================================+ + | conjugated | ``[*-;!O:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | conjugated-N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | conjugated-O- | ``[O:1]=[#6:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+-----------------------------------------------------------------------------+ + | Cterm | ``[C-;X2:1]=[O:2]>>[C+0:1]=[O:2]`` | + +---------------+-----------------------------------------------------------------------------+ + | Nterm | ``[N-;X2;H1:1]>>[N+0:1]`` | + +---------------+-----------------------------------------------------------------------------+ + | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | + +---------------+-----------------------------------------------------------------------------+ + | arginine | ``[N;H1:1]-[C-;X3;H0:2](-[N;H2:3])-[N;H2:4]>>[N:1]-[C+0:2](-[N:3])=[N+:4]`` | + +---------------+-----------------------------------------------------------------------------+ + | sulfone | ``[S;X4;v4:1](-[O-;X1:2])-[O-;X1:3]>>[S:1](=[O+0:2])=[O+0:3]`` | + +---------------+-----------------------------------------------------------------------------+ + | nitro | ``[N;X3;v3:1](-[O-;X1:2])-[O-;X1:3]>>[N+:1](-[O-:2])=[O+0:3]`` | + +---------------+-----------------------------------------------------------------------------+ """