From 9db9db0bddb2942afc17c78aa3ade4b6462f9531 Mon Sep 17 00:00:00 2001 From: Cedric Bouysset Date: Fri, 29 Sep 2023 11:39:21 +0100 Subject: [PATCH 01/14] refactor: move inferring code to separate module --- package/MDAnalysis/converters/RDKit.py | 427 +---------------- .../MDAnalysis/converters/RDKitInferring.py | 430 ++++++++++++++++++ .../MDAnalysisTests/converters/test_rdkit.py | 5 +- 3 files changed, 437 insertions(+), 425 deletions(-) create mode 100644 package/MDAnalysis/converters/RDKitInferring.py diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 139528440ab..89421f14560 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -96,19 +96,12 @@ try: from rdkit import Chem - from rdkit.Chem import AllChem + + from .RDKitInferring import ( + RDBONDORDER, _infer_bo_and_charges, _standardize_patterns) except ImportError: pass else: - RDBONDORDER = { - 1: Chem.BondType.SINGLE, - 1.5: Chem.BondType.AROMATIC, - "ar": Chem.BondType.AROMATIC, - 2: Chem.BondType.DOUBLE, - 3: Chem.BondType.TRIPLE, - } - # add string version of the key for each bond - RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) RDATTRIBUTES = { "altLocs": "AltLoc", "chainIDs": "ChainId", @@ -120,31 +113,7 @@ "segindices": "SegmentNumber", "tempfactors": "TempFactor", } - PERIODIC_TABLE = Chem.GetPeriodicTable() - - -# charges that should be assigned to monatomic cations -# structure --> atomic number : formal charge -# anion charges are directly handled by the code using the typical valence -# of the atom -MONATOMIC_CATION_CHARGES = { - 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, - 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, - 26: 2, # Fe could also be +3 - 13: 3, -} -# reactions uses by _standardize_patterns to fix challenging cases -# must have single reactant and product, and cannot add any atom -STANDARDIZATION_REACTIONS = [ - "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm - "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm - "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate - "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG - "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" - ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP - "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone - "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N -] + _deduce_PDB_atom_name = PDBWriter(StringIO())._deduce_PDB_atom_name @@ -589,391 +558,3 @@ def _ignore_prop(atom, attr, value): def _set_atom_property(atom, attr, value): """Saves any attribute and value into an RDKit atom property""" _atom_property_dispatcher.get(type(value), _ignore_prop)(atom, attr, value) - - -def _atom_sorter(atom): - """Sorts atoms in the molecule in a way that makes it easy for the bond - order and charge infering code to get the correct state on the first - try. Currently sorts by number of unpaired electrons, then by number of - heavy atom neighbors (i.e. atoms at the edge first).""" - num_heavy_neighbors = len([ - neighbor for neighbor in atom.GetNeighbors() - if neighbor.GetAtomicNum() > 1] - ) - return (-_get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) - - -def _infer_bo_and_charges(mol): - """Infer bond orders and formal charges from a molecule. - - Since most MD topology files don't explicitly retain information on bond - orders or charges, it has to be guessed from the topology. This is done by - looping over each atom and comparing its expected valence to the current - valence to get the Number of Unpaired Electrons (NUE). - If an atom has a negative NUE, it needs a positive formal charge (-NUE). - If two neighbouring atoms have UEs, the bond between them most - likely has to be increased by the value of the smallest NUE. - If after this process, an atom still has UEs, it needs a negative formal - charge of -NUE. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule is modified inplace and must have all hydrogens added - - Notes - ----- - This algorithm is order dependant. For example, for a carboxylate group - R-C(-O)-O the first oxygen read will receive a double bond and the other - one will be charged. It will also affect more complex conjugated systems. - """ - # heavy atoms sorted by number of heavy atom neighbors (lower first) then - # NUE (higher first) - atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], - key=_atom_sorter) - - for atom in atoms: - # monatomic ions - if atom.GetDegree() == 0: - atom.SetFormalCharge(MONATOMIC_CATION_CHARGES.get( - atom.GetAtomicNum(), - -_get_nb_unpaired_electrons(atom)[0])) - mol.UpdatePropertyCache(strict=False) - continue - # get NUE for each possible valence - nue = _get_nb_unpaired_electrons(atom) - # if there's only one possible valence state and the corresponding - # NUE is negative, it means we can only add a positive charge to - # the atom - if (len(nue) == 1) and (nue[0] < 0): - atom.SetFormalCharge(-nue[0]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if above case or atom has no unpaired electron - if (len(nue) == 1) and (nue[0] <= 0): - continue - else: - neighbors = sorted(atom.GetNeighbors(), reverse=True, - key=lambda a: _get_nb_unpaired_electrons(a)[0]) - # check if one of the neighbors has a common NUE - for na in neighbors: - # get NUE for the neighbor - na_nue = _get_nb_unpaired_electrons(na) - # smallest common NUE - common_nue = min( - min([i for i in nue if i >= 0], default=0), - min([i for i in na_nue if i >= 0], default=0) - ) - # a common NUE of 0 means we don't need to do anything - if common_nue != 0: - # increase bond order - bond = mol.GetBondBetweenAtoms( - atom.GetIdx(), na.GetIdx()) - order = common_nue + 1 - bond.SetBondType(RDBONDORDER[order]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if one of the valences is complete - nue = _get_nb_unpaired_electrons(atom) - if any([n == 0 for n in nue]): - break - - # if atom valence is still not filled - nue = _get_nb_unpaired_electrons(atom) - if not any([n == 0 for n in nue]): - # transform nue to charge - atom.SetFormalCharge(-nue[0]) - atom.SetNumRadicalElectrons(0) - mol.UpdatePropertyCache(strict=False) - - -def _get_nb_unpaired_electrons(atom): - """Calculate the number of unpaired electrons (NUE) of an atom - - Parameters - ---------- - atom: rdkit.Chem.rdchem.Atom - The atom for which the NUE will be computed - - Returns - ------- - nue : list - The NUE for each possible valence of the atom - """ - expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) - current_v = atom.GetTotalValence() - atom.GetFormalCharge() - return [v - current_v for v in expected_vs] - - -def _standardize_patterns(mol, max_iter=200): - """Standardizes functional groups - - Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, - and SMARTS reactions for other functional groups. - Due to the way reactions work, we first have to split the molecule by - fragments. Then, for each fragment, we apply the standardization reactions. - Finally, the fragments are recombined. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to standardize - max_iter : int - Maximum number of iterations to standardize conjugated systems - - Returns - ------- - mol : rdkit.Chem.rdchem.Mol - The standardized molecule - - Notes - ----- - The following functional groups are transformed in this order: - - +---------------+------------------------------------------------------------------------------+ - | Name | Reaction | - +===============+==============================================================================+ - | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | - +---------------+------------------------------------------------------------------------------+ - | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | - +---------------+------------------------------------------------------------------------------+ - | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | - +---------------+------------------------------------------------------------------------------+ - | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | - +---------------+------------------------------------------------------------------------------+ - | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| - +---------------+------------------------------------------------------------------------------+ - | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - - """ - - # standardize conjugated systems - _rebuild_conjugated_bonds(mol, max_iter) - - # list of sanitized reactions - reactions = [] - for rxn in STANDARDIZATION_REACTIONS: - reaction = AllChem.ReactionFromSmarts(rxn) - reactions.append(reaction) - - # fragment mol (reactions must have single reactant and product) - fragments = list(Chem.GetMolFrags(mol, asMols=True)) - for reactant in fragments: - _apply_reactions(reactions, reactant) - - # recombine fragments - newmol = fragments.pop(0) - for fragment in fragments: - newmol = Chem.CombineMols(newmol, fragment) - - return newmol - - -def _apply_reactions(reactions, reactant): - """Applies a series of unimolecular reactions to a molecule. The reactions - must not add any atom to the product. The molecule is modified inplace. - - Parameters - ---------- - reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] - Reactions from SMARTS. Each reaction is applied until no more - transformations can be made. - reactant : rdkit.Chem.rdchem.Mol - The molecule to transform inplace - - """ - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) - for reaction in reactions: - while reaction.RunReactantInPlace(reactant): - reactant.UpdatePropertyCache(strict=False) - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) - - -def _rebuild_conjugated_bonds(mol, max_iter=200): - """Rebuild conjugated bonds without negatively charged atoms at the - beginning and end of the conjugated system - - Depending on the order in which atoms are read during the conversion, the - :func:`_infer_bo_and_charges` function might write conjugated systems with - a double bond less and both edges of the system as anions instead of the - usual alternating single and double bonds. This function corrects this - behaviour by using an iterative procedure. - The problematic molecules always follow the same pattern: - ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number - of successive single and double bonds. The goal of the iterative procedure - is to make ``n`` as small as possible by consecutively transforming - ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with - ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to - ``*=*-*=*``. - Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can - control that we don't transform the same triplet of atoms back and forth by - adding their indices to a list. - This function also handles conjugated systems with triple bonds. - The molecule needs to be kekulized first to also cover systems - with aromatic rings. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to transform, modified inplace - max_iter : int - Maximum number of iterations performed by the function - - Notes - ----- - The molecule is modified inplace - """ - mol.UpdatePropertyCache(strict=False) - Chem.Kekulize(mol) - # pattern used to find problematic conjugated bonds - # there's usually an even number of matches for this - pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") - # pattern used to finish fixing a series of conjugated bonds - base_end_pattern = Chem.MolFromSmarts( - "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") - # used when there's an odd number of matches for `pattern` - odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," - "$([S;D4;v4]-[O-])]") - # number of unique matches with the pattern - n_matches = len(set([match[0] - for match in mol.GetSubstructMatches(pattern)])) - # nothing to standardize - if n_matches == 0: - return - # single match (unusual) - elif n_matches == 1: - # as a last resort, the only way to standardize is to find a nitrogen - # that can accept a double bond and a positive charge - # or a carbonyl that will become an enolate - end_pattern = odd_end_pattern - # at least 2 matches - else: - # give priority to base end pattern and then deal with the odd one if - # necessary - end_pattern = base_end_pattern - backtrack = [] - backtrack_cycles = 0 - for _ in range(max_iter): - # check for ending pattern - end_match = mol.GetSubstructMatch(end_pattern) - if end_match: - # index of each atom - anion1, a1, a2, anion2 = end_match - term_atom = mol.GetAtomWithIdx(anion2) - # edge-case 1: C-[O-] or [N+]-[O-] - # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] - # transform the =O to -[O-] - if ( - term_atom.GetAtomicNum() == 6 - and term_atom.GetFormalCharge() == 0 - ) or ( - term_atom.GetAtomicNum() == 7 - and term_atom.GetFormalCharge() == 1 - ): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - bond.GetBondTypeAsDouble() == 2): - bond.SetBondType(Chem.BondType.SINGLE) - neighbor.SetFormalCharge(-1) - break - # edge-case 2: S=O - # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O - # transform -[O-] to =O - elif (term_atom.GetAtomicNum() == 16 and - term_atom.GetFormalCharge() == 0): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - neighbor.GetFormalCharge() == -1 and - bond.GetBondTypeAsDouble() == 1): - bond.SetBondType(Chem.BondType.DOUBLE) - neighbor.SetFormalCharge(0) - break - # not an edge case: - # increment the charge - else: - term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) - # common to all cases: - # [*-]-*=*-[R] --> *=*-*=[R] - # increment the charge and switch single and double bonds - a = mol.GetAtomWithIdx(anion1) - a.SetFormalCharge(a.GetFormalCharge() + 1) - b = mol.GetBondBetweenAtoms(anion1, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) - b = mol.GetBondBetweenAtoms(a2, anion2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - mol.UpdatePropertyCache(strict=False) - continue - - # switch the position of the charge and the double bond - matches = mol.GetSubstructMatches(pattern) - if matches: - # check if we haven't already transformed this triplet - for match in matches: - # store order-independent atom indices - g = set(match) - # already transformed --> try the next one - if g in backtrack: - continue - # add to backtracking and start the switch - else: - anion, a1, a2 = match - backtrack.append(g) - break - # already performed all changes - else: - if backtrack_cycles == 1: - # might be stuck because there were 2 "odd" end patterns - # misqualified as a single base one - end_pattern = odd_end_pattern - elif backtrack_cycles > 1: - # likely stuck changing the same bonds over and over so - # let's stop here - mol.UpdatePropertyCache(strict=False) - return - match = matches[0] - anion, a1, a2 = match - backtrack = [set(match)] - backtrack_cycles += 1 - - # switch charges - a = mol.GetAtomWithIdx(anion) - a.SetFormalCharge(a.GetFormalCharge() + 1) - a = mol.GetAtomWithIdx(a2) - a.SetFormalCharge(a.GetFormalCharge() - 1) - # switch bond orders - b = mol.GetBondBetweenAtoms(anion, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) - mol.UpdatePropertyCache(strict=False) - # update number of matches for the end pattern - n_matches = len(set([match[0] for match in matches])) - if n_matches == 1: - end_pattern = odd_end_pattern - # start new iteration - continue - - # no more changes to apply - mol.UpdatePropertyCache(strict=False) - return - - # reached max_iter - warnings.warn("The standardization could not be completed within a " - "reasonable number of iterations") diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py new file mode 100644 index 00000000000..9fa27989abd --- /dev/null +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -0,0 +1,430 @@ +import warnings + +try: + from rdkit import Chem + from rdkit.Chem import AllChem +except ImportError: + pass +else: + RDBONDORDER = { + 1: Chem.BondType.SINGLE, + 1.5: Chem.BondType.AROMATIC, + "ar": Chem.BondType.AROMATIC, + 2: Chem.BondType.DOUBLE, + 3: Chem.BondType.TRIPLE, + } + # add string version of the key for each bond + RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) + PERIODIC_TABLE = Chem.GetPeriodicTable() + + +# charges that should be assigned to monatomic cations +# structure --> atomic number : formal charge +# anion charges are directly handled by the code using the typical valence +# of the atom +MONATOMIC_CATION_CHARGES = { + 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, + 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, + 26: 2, # Fe could also be +3 + 13: 3, +} +# reactions uses by _standardize_patterns to fix challenging cases +# must have single reactant and product, and cannot add any atom +STANDARDIZATION_REACTIONS = [ + "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm + "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm + "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate + "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG + "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" + ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP + "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone + "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N +] + + +def _atom_sorter(atom): + """Sorts atoms in the molecule in a way that makes it easy for the bond + order and charge infering code to get the correct state on the first + try. Currently sorts by number of unpaired electrons, then by number of + heavy atom neighbors (i.e. atoms at the edge first).""" + num_heavy_neighbors = len([ + neighbor for neighbor in atom.GetNeighbors() + if neighbor.GetAtomicNum() > 1] + ) + return (-_get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) + + +def _infer_bo_and_charges(mol): + """Infer bond orders and formal charges from a molecule. + + Since most MD topology files don't explicitly retain information on bond + orders or charges, it has to be guessed from the topology. This is done by + looping over each atom and comparing its expected valence to the current + valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge (-NUE). + If two neighbouring atoms have UEs, the bond between them most + likely has to be increased by the value of the smallest NUE. + If after this process, an atom still has UEs, it needs a negative formal + charge of -NUE. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + Notes + ----- + This algorithm is order dependant. For example, for a carboxylate group + R-C(-O)-O the first oxygen read will receive a double bond and the other + one will be charged. It will also affect more complex conjugated systems. + """ + # heavy atoms sorted by number of heavy atom neighbors (lower first) then + # NUE (higher first) + atoms = sorted([atom for atom in mol.GetAtoms() + if atom.GetAtomicNum() > 1], + key=_atom_sorter) + + for atom in atoms: + # monatomic ions + if atom.GetDegree() == 0: + atom.SetFormalCharge(MONATOMIC_CATION_CHARGES.get( + atom.GetAtomicNum(), + -_get_nb_unpaired_electrons(atom)[0])) + mol.UpdatePropertyCache(strict=False) + continue + # get NUE for each possible valence + nue = _get_nb_unpaired_electrons(atom) + # if there's only one possible valence state and the corresponding + # NUE is negative, it means we can only add a positive charge to + # the atom + if (len(nue) == 1) and (nue[0] < 0): + atom.SetFormalCharge(-nue[0]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if above case or atom has no unpaired electron + if (len(nue) == 1) and (nue[0] <= 0): + continue + else: + neighbors = sorted(atom.GetNeighbors(), reverse=True, + key=lambda a: _get_nb_unpaired_electrons(a)[0]) + # check if one of the neighbors has a common NUE + for na in neighbors: + # get NUE for the neighbor + na_nue = _get_nb_unpaired_electrons(na) + # smallest common NUE + common_nue = min( + min([i for i in nue if i >= 0], default=0), + min([i for i in na_nue if i >= 0], default=0) + ) + # a common NUE of 0 means we don't need to do anything + if common_nue != 0: + # increase bond order + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + order = common_nue + 1 + bond.SetBondType(RDBONDORDER[order]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if one of the valences is complete + nue = _get_nb_unpaired_electrons(atom) + if any([n == 0 for n in nue]): + break + + # if atom valence is still not filled + nue = _get_nb_unpaired_electrons(atom) + if not any([n == 0 for n in nue]): + # transform nue to charge + atom.SetFormalCharge(-nue[0]) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) + + +def _get_nb_unpaired_electrons(atom): + """Calculate the number of unpaired electrons (NUE) of an atom + + Parameters + ---------- + atom: rdkit.Chem.rdchem.Atom + The atom for which the NUE will be computed + + Returns + ------- + nue : list + The NUE for each possible valence of the atom + """ + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() - atom.GetFormalCharge() + return [v - current_v for v in expected_vs] + + +def _standardize_patterns(mol, max_iter=200): + """Standardizes functional groups + + Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, + and SMARTS reactions for other functional groups. + Due to the way reactions work, we first have to split the molecule by + fragments. Then, for each fragment, we apply the standardization reactions. + Finally, the fragments are recombined. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + max_iter : int + Maximum number of iterations to standardize conjugated systems + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule + + Notes + ----- + The following functional groups are transformed in this order: + + +---------------+------------------------------------------------------------------------------+ + | Name | Reaction | + +===============+==============================================================================+ + | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | + +---------------+------------------------------------------------------------------------------+ + | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | + +---------------+------------------------------------------------------------------------------+ + | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | + +---------------+------------------------------------------------------------------------------+ + | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | + +---------------+------------------------------------------------------------------------------+ + | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| + +---------------+------------------------------------------------------------------------------+ + | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + + """ + + # standardize conjugated systems + _rebuild_conjugated_bonds(mol, max_iter) + + # list of sanitized reactions + reactions = [] + for rxn in STANDARDIZATION_REACTIONS: + reaction = AllChem.ReactionFromSmarts(rxn) + reactions.append(reaction) + + # fragment mol (reactions must have single reactant and product) + fragments = list(Chem.GetMolFrags(mol, asMols=True)) + for reactant in fragments: + _apply_reactions(reactions, reactant) + + # recombine fragments + newmol = fragments.pop(0) + for fragment in fragments: + newmol = Chem.CombineMols(newmol, fragment) + + return newmol + + +def _apply_reactions(reactions, reactant): + """Applies a series of unimolecular reactions to a molecule. The reactions + must not add any atom to the product. The molecule is modified inplace. + + Parameters + ---------- + reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] + Reactions from SMARTS. Each reaction is applied until no more + transformations can be made. + reactant : rdkit.Chem.rdchem.Mol + The molecule to transform inplace + + """ + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + for reaction in reactions: + while reaction.RunReactantInPlace(reactant): + reactant.UpdatePropertyCache(strict=False) + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + + +def _rebuild_conjugated_bonds(mol, max_iter=200): + """Rebuild conjugated bonds without negatively charged atoms at the + beginning and end of the conjugated system + + Depending on the order in which atoms are read during the conversion, the + :func:`_infer_bo_and_charges` function might write conjugated systems with + a double bond less and both edges of the system as anions instead of the + usual alternating single and double bonds. This function corrects this + behaviour by using an iterative procedure. + The problematic molecules always follow the same pattern: + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number + of successive single and double bonds. The goal of the iterative procedure + is to make ``n`` as small as possible by consecutively transforming + ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with + ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to + ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can + control that we don't transform the same triplet of atoms back and forth by + adding their indices to a list. + This function also handles conjugated systems with triple bonds. + The molecule needs to be kekulized first to also cover systems + with aromatic rings. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform, modified inplace + max_iter : int + Maximum number of iterations performed by the function + + Notes + ----- + The molecule is modified inplace + """ + mol.UpdatePropertyCache(strict=False) + Chem.Kekulize(mol) + # pattern used to find problematic conjugated bonds + # there's usually an even number of matches for this + pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") + # pattern used to finish fixing a series of conjugated bonds + base_end_pattern = Chem.MolFromSmarts( + "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + # used when there's an odd number of matches for `pattern` + odd_end_pattern = Chem.MolFromSmarts( + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," + "$([S;D4;v4]-[O-])]") + # number of unique matches with the pattern + n_matches = len(set([match[0] + for match in mol.GetSubstructMatches(pattern)])) + # nothing to standardize + if n_matches == 0: + return + # single match (unusual) + elif n_matches == 1: + # as a last resort, the only way to standardize is to find a nitrogen + # that can accept a double bond and a positive charge + # or a carbonyl that will become an enolate + end_pattern = odd_end_pattern + # at least 2 matches + else: + # give priority to base end pattern and then deal with the odd one if + # necessary + end_pattern = base_end_pattern + backtrack = [] + backtrack_cycles = 0 + for _ in range(max_iter): + # check for ending pattern + end_match = mol.GetSubstructMatch(end_pattern) + if end_match: + # index of each atom + anion1, a1, a2, anion2 = end_match + term_atom = mol.GetAtomWithIdx(anion2) + # edge-case 1: C-[O-] or [N+]-[O-] + # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] + # transform the =O to -[O-] + if ( + term_atom.GetAtomicNum() == 6 + and term_atom.GetFormalCharge() == 0 + ) or ( + term_atom.GetAtomicNum() == 7 + and term_atom.GetFormalCharge() == 1 + ): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if (neighbor.GetAtomicNum() == 8 and + bond.GetBondTypeAsDouble() == 2): + bond.SetBondType(Chem.BondType.SINGLE) + neighbor.SetFormalCharge(-1) + break + # edge-case 2: S=O + # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O + # transform -[O-] to =O + elif (term_atom.GetAtomicNum() == 16 and + term_atom.GetFormalCharge() == 0): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if (neighbor.GetAtomicNum() == 8 and + neighbor.GetFormalCharge() == -1 and + bond.GetBondTypeAsDouble() == 1): + bond.SetBondType(Chem.BondType.DOUBLE) + neighbor.SetFormalCharge(0) + break + # not an edge case: + # increment the charge + else: + term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) + # common to all cases: + # [*-]-*=*-[R] --> *=*-*=[R] + # increment the charge and switch single and double bonds + a = mol.GetAtomWithIdx(anion1) + a.SetFormalCharge(a.GetFormalCharge() + 1) + b = mol.GetBondBetweenAtoms(anion1, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + b = mol.GetBondBetweenAtoms(a2, anion2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + mol.UpdatePropertyCache(strict=False) + continue + + # switch the position of the charge and the double bond + matches = mol.GetSubstructMatches(pattern) + if matches: + # check if we haven't already transformed this triplet + for match in matches: + # store order-independent atom indices + g = set(match) + # already transformed --> try the next one + if g in backtrack: + continue + # add to backtracking and start the switch + else: + anion, a1, a2 = match + backtrack.append(g) + break + # already performed all changes + else: + if backtrack_cycles == 1: + # might be stuck because there were 2 "odd" end patterns + # misqualified as a single base one + end_pattern = odd_end_pattern + elif backtrack_cycles > 1: + # likely stuck changing the same bonds over and over so + # let's stop here + mol.UpdatePropertyCache(strict=False) + return + match = matches[0] + anion, a1, a2 = match + backtrack = [set(match)] + backtrack_cycles += 1 + + # switch charges + a = mol.GetAtomWithIdx(anion) + a.SetFormalCharge(a.GetFormalCharge() + 1) + a = mol.GetAtomWithIdx(a2) + a.SetFormalCharge(a.GetFormalCharge() - 1) + # switch bond orders + b = mol.GetBondBetweenAtoms(anion, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + mol.UpdatePropertyCache(strict=False) + # update number of matches for the end pattern + n_matches = len(set([match[0] for match in matches])) + if n_matches == 1: + end_pattern = odd_end_pattern + # start new iteration + continue + + # no more changes to apply + mol.UpdatePropertyCache(strict=False) + return + + # reached max_iter + warnings.warn("The standardization could not be completed within a " + "reasonable number of iterations") diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 85860162f7b..6139f962f9e 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -36,11 +36,12 @@ try: from MDAnalysis.converters.RDKit import (RDATTRIBUTES, _add_mda_attr_to_rdkit, - _atom_sorter, _infer_bo_and_charges, - _rebuild_conjugated_bonds, _set_atom_property, _standardize_patterns) + from MDAnalysis.converters.RDKitInferring import ( + _atom_sorter, _rebuild_conjugated_bonds) + from rdkit import Chem from rdkit.Chem import AllChem except ImportError: From 835ea4f8f13f43ca50c62b4fdf17fb56ef4c9542 Mon Sep 17 00:00:00 2001 From: Cedric Bouysset Date: Fri, 29 Sep 2023 16:29:49 +0100 Subject: [PATCH 02/14] add docs stub --- .../doc/sphinx/source/documentation_pages/converters/RDKit.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst b/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst index 760446fd1d5..fa4525f69b6 100644 --- a/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst +++ b/package/doc/sphinx/source/documentation_pages/converters/RDKit.rst @@ -1,3 +1,5 @@ .. automodule:: MDAnalysis.converters.RDKitParser .. automodule:: MDAnalysis.converters.RDKit + +.. automodule:: MDAnalysis.converters.RDKitInferring From 7390f962c4ba0a18fe77d762d5d17af3580178e7 Mon Sep 17 00:00:00 2001 From: Cedric Bouysset Date: Fri, 29 Sep 2023 16:32:14 +0100 Subject: [PATCH 03/14] refactor as a class --- package/MDAnalysis/converters/RDKit.py | 68 +- .../MDAnalysis/converters/RDKitInferring.py | 891 ++++++++++-------- .../MDAnalysisTests/converters/test_rdkit.py | 44 +- 3 files changed, 557 insertions(+), 446 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 89421f14560..119ce055c92 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -24,7 +24,8 @@ """RDKit molecule I/O --- :mod:`MDAnalysis.converters.RDKit` ================================================================ -Read coordinates data from an `RDKit `__ :class:`rdkit.Chem.rdchem.Mol` with +Read coordinates data from an `RDKit `__ +:class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitReader` into an MDAnalysis Universe. Convert it back to a :class:`rdkit.Chem.rdchem.Mol` with :class:`RDKitConverter`. @@ -98,7 +99,7 @@ from rdkit import Chem from .RDKitInferring import ( - RDBONDORDER, _infer_bo_and_charges, _standardize_patterns) + RDBONDORDER, MDAnalysisInferer) except ImportError: pass else: @@ -113,6 +114,10 @@ "segindices": "SegmentNumber", "tempfactors": "TempFactor", } + DEFAULT_INFERER = MDAnalysisInferer() + # only here for backwards compatibility + _infer_bo_and_charges = DEFAULT_INFERER._infer_bo_and_charges + _standardize_patterns = DEFAULT_INFERER._standardize_patterns _deduce_PDB_atom_name = PDBWriter(StringIO())._deduce_PDB_atom_name @@ -287,24 +292,28 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} - def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, - force=False): + def convert(self, obj, cache=True, NoImplicit=True, max_iter=None, + force=False, inferer=DEFAULT_INFERER): """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. Parameters ----------- - obj : :class:`~MDAnalysis.core.groups.AtomGroup` or :class:`~MDAnalysis.core.universe.Universe` + obj : :class:`~MDAnalysis.core.groups.AtomGroup` or + :class:`~MDAnalysis.core.universe.Universe` cache : bool Use a cached copy of the molecule's topology when available. To be used, the cached molecule and the new one have to be made from the same AtomGroup selection and with the same arguments passed to the converter + inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] + A callable to infer bond orders and charges for the RDKit molecule created + by the converter. If ``None``, inferring is skipped. NoImplicit : bool - Prevent adding hydrogens to the molecule + Prevent adding hydrogens to the molecule. max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` + Deprecated, use `MDAnalysisInferer(max_iter=...)` instead. Maximum number + of iterations to standardize conjugated systems. force : bool Force the conversion when no hydrogens were detected but ``NoImplicit=True``. Useful for inorganic molecules mostly. @@ -324,8 +333,14 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, "please use a valid AtomGroup or Universe".format( type(obj))) from None + if max_iter is not None: + warnings.warn( + "Using `max_iter` is deprecated, use `MDAnalysisInferer(max_iter=...)` " + "instead", DeprecationWarning) + if isinstance(inferer, MDAnalysisInferer): + inferer = MDAnalysisInferer(max_iter=max_iter) # parameters passed to atomgroup_to_mol - kwargs = dict(NoImplicit=NoImplicit, max_iter=max_iter, force=force) + kwargs = dict(NoImplicit=NoImplicit, force=force, inferer=inferer) if cache: mol = atomgroup_to_mol(ag, **kwargs) mol = copy.deepcopy(mol) @@ -353,7 +368,7 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=200, @lru_cache(maxsize=2) -def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): +def atomgroup_to_mol(ag, NoImplicit=True, force=False, inferer=DEFAULT_INFERER): """Converts an AtomGroup to an RDKit molecule without coordinates. Parameters @@ -363,12 +378,12 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): NoImplicit : bool Prevent adding hydrogens to the molecule and allow bond orders and formal charges to be guessed from the valence of each atom. - max_iter : int - Maximum number of iterations to standardize conjugated systems. - See :func:`_rebuild_conjugated_bonds` force : bool Force the conversion when no hydrogens were detected but ``NoImplicit=True``. Mostly useful for inorganic molecules. + inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] + A callable to infer bond orders and charges for the RDKit molecule created + by the converter. If ``None``, inferring is skipped. """ try: elements = ag.elements @@ -385,14 +400,13 @@ def atomgroup_to_mol(ag, NoImplicit=True, max_iter=200, force=False): "No hydrogen atom found in the topology. " "Forcing to continue the conversion." ) - elif NoImplicit: + elif inferer is not None: raise AttributeError( "No hydrogen atom could be found in the topology, but the " "converter requires all hydrogens to be explicit. You can use " - "the parameter ``NoImplicit=False`` when using the converter " - "to allow implicit hydrogens and disable inferring bond " - "orders and charges. You can also use ``force=True`` to " - "ignore this error.") + "the parameter ``inferer=None`` when using the converter " + "to disable inferring bond orders and charges. You can also use " + "``force=True`` to ignore this error.") # attributes accepted in PDBResidueInfo object pdb_attrs = {} @@ -454,21 +468,9 @@ def get_resname(idx): mol.UpdatePropertyCache(strict=False) - if NoImplicit: - # infer bond orders and formal charges from the connectivity - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol, max_iter) - # reorder atoms to match MDAnalysis, since the reactions from - # _standardize_patterns will mess up the original order - order = np.argsort([atom.GetIntProp("_MDAnalysis_index") - for atom in mol.GetAtoms()]) - mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) - - # sanitize if possible - err = Chem.SanitizeMol(mol, catchErrors=True) - if err: - warnings.warn("Could not sanitize molecule: " - f"failed during step {err!r}") + if inferer is not None and NoImplicit: + # infer bond orders and formal charges + mol = inferer(mol) return mol diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 9fa27989abd..e51922e2766 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -1,4 +1,41 @@ +# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*- +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# +# MDAnalysis --- https://www.mdanalysis.org +# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors +# (see the file AUTHORS for the full list of names) +# +# Released under the GNU Public Licence, v2 or any higher version +# +# Please cite your use of MDAnalysis in published work: +# +# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler, +# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein. +# MDAnalysis: A Python package for the rapid analysis of molecular dynamics +# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th +# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy. +# doi: 10.25080/majora-629e541a-00e +# +# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein. +# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations. +# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787 +# + +"""RDKit bond order inferring --- :mod:`MDAnalysis.converters.RDKitInferring` +============================================================================= +Bond order and formal charge inferring for RDKit molecules. + +Classes +------- + +.. autoclass:: MDAnalysisInferer + :members: +""" import warnings +from dataclasses import dataclass +from typing import ClassVar, Dict, List + +import numpy as np try: from rdkit import Chem @@ -18,413 +55,475 @@ PERIODIC_TABLE = Chem.GetPeriodicTable() -# charges that should be assigned to monatomic cations -# structure --> atomic number : formal charge -# anion charges are directly handled by the code using the typical valence -# of the atom -MONATOMIC_CATION_CHARGES = { - 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, - 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, - 26: 2, # Fe could also be +3 - 13: 3, -} -# reactions uses by _standardize_patterns to fix challenging cases -# must have single reactant and product, and cannot add any atom -STANDARDIZATION_REACTIONS = [ - "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm - "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm - "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate - "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG - "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" - ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP - "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone - "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N -] - - -def _atom_sorter(atom): - """Sorts atoms in the molecule in a way that makes it easy for the bond - order and charge infering code to get the correct state on the first - try. Currently sorts by number of unpaired electrons, then by number of - heavy atom neighbors (i.e. atoms at the edge first).""" - num_heavy_neighbors = len([ - neighbor for neighbor in atom.GetNeighbors() - if neighbor.GetAtomicNum() > 1] - ) - return (-_get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) - - -def _infer_bo_and_charges(mol): - """Infer bond orders and formal charges from a molecule. - - Since most MD topology files don't explicitly retain information on bond - orders or charges, it has to be guessed from the topology. This is done by - looping over each atom and comparing its expected valence to the current - valence to get the Number of Unpaired Electrons (NUE). - If an atom has a negative NUE, it needs a positive formal charge (-NUE). - If two neighbouring atoms have UEs, the bond between them most - likely has to be increased by the value of the smallest NUE. - If after this process, an atom still has UEs, it needs a negative formal - charge of -NUE. - - Parameters +@dataclass(kw_only=True, frozen=True) +class MDAnalysisInferer: + """Bond order and formal charge inferring as originally implemented for the RDKit + converter. + + Attributes ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule is modified inplace and must have all hydrogens added - - Notes - ----- - This algorithm is order dependant. For example, for a carboxylate group - R-C(-O)-O the first oxygen read will receive a double bond and the other - one will be charged. It will also affect more complex conjugated systems. + max_iter : int + Maximum number of iterations to standardize conjugated systems. + See :meth:`_rebuild_conjugated_bonds` + MONATOMIC_CATION_CHARGES : ClassVar[Dict[int, int]] + Charges that should be assigned to monatomic cations. Maps atomic number to + their formal charge. Anion charges are directly handled by the code using the + typical valence of the atom. + STANDARDIZATION_REACTIONS : ClassVar[List[str]] + Reactions uses by :meth:`_standardize_patterns` to fix challenging cases must + have single reactant and product, and cannot add any atom + + .. versionadded:: 2.7.0 """ - # heavy atoms sorted by number of heavy atom neighbors (lower first) then - # NUE (higher first) - atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], - key=_atom_sorter) - - for atom in atoms: - # monatomic ions - if atom.GetDegree() == 0: - atom.SetFormalCharge(MONATOMIC_CATION_CHARGES.get( - atom.GetAtomicNum(), - -_get_nb_unpaired_electrons(atom)[0])) - mol.UpdatePropertyCache(strict=False) - continue - # get NUE for each possible valence - nue = _get_nb_unpaired_electrons(atom) - # if there's only one possible valence state and the corresponding - # NUE is negative, it means we can only add a positive charge to - # the atom - if (len(nue) == 1) and (nue[0] < 0): - atom.SetFormalCharge(-nue[0]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if above case or atom has no unpaired electron - if (len(nue) == 1) and (nue[0] <= 0): - continue + MONATOMIC_CATION_CHARGES: ClassVar[Dict[int, int]] = { + 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, + 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, + 26: 2, # Fe could also be +3 + 13: 3, + } + STANDARDIZATION_REACTIONS: ClassVar[List[str]] = [ + "[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]", # Cterm + "[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]", # Nterm + "[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]", # keto-enolate + "[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]", # ARG + "[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]" + ">>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]", # HIP + "[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]", # sulfone + "[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]", # charged-N + ] + + max_iter: int = 200 + + def __call__(self, mol): + """Infer bond orders and formal charges in the molecule. + + Parameters + ---------- + mol : Chem.Mol + The molecule to infer bond orders and charges for. + + Returns + ------- + A new molecule with proper bond orders and charges. The molecule needs to be + sanitized. + """ + self._infer_bo_and_charges(mol) + mol = self._standardize_patterns(mol) + mol = self._reorder_atoms(mol) + # sanitize if possible + err = Chem.SanitizeMol(mol, catchErrors=True) + if err: + warnings.warn("Could not sanitize molecule: " + f"failed during step {err!r}") + return mol + + def _reorder_atoms(self, mol): + """Reorder atoms to match MDAnalysis, since the reactions from + :meth:`_standardize_patterns` will mess up the original order. + """ + try: + order = np.argsort([atom.GetIntProp("_MDAnalysis_index") + for atom in mol.GetAtoms()]) + except KeyError: + # not a molecule converted by MDAnalysis + pass else: - neighbors = sorted(atom.GetNeighbors(), reverse=True, - key=lambda a: _get_nb_unpaired_electrons(a)[0]) - # check if one of the neighbors has a common NUE - for na in neighbors: - # get NUE for the neighbor - na_nue = _get_nb_unpaired_electrons(na) - # smallest common NUE - common_nue = min( - min([i for i in nue if i >= 0], default=0), - min([i for i in na_nue if i >= 0], default=0) - ) - # a common NUE of 0 means we don't need to do anything - if common_nue != 0: - # increase bond order - bond = mol.GetBondBetweenAtoms( - atom.GetIdx(), na.GetIdx()) - order = common_nue + 1 - bond.SetBondType(RDBONDORDER[order]) - mol.UpdatePropertyCache(strict=False) - # go to next atom if one of the valences is complete - nue = _get_nb_unpaired_electrons(atom) - if any([n == 0 for n in nue]): - break - - # if atom valence is still not filled - nue = _get_nb_unpaired_electrons(atom) - if not any([n == 0 for n in nue]): - # transform nue to charge + mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) + return mol + + @classmethod + def _atom_sorter(cls, atom): + """Sorts atoms in the molecule in a way that makes it easy for the bond + order and charge infering code to get the correct state on the first + try. Currently sorts by number of unpaired electrons, then by number of + heavy atom neighbors (i.e. atoms at the edge first).""" + num_heavy_neighbors = len([ + neighbor for neighbor in atom.GetNeighbors() + if neighbor.GetAtomicNum() > 1] + ) + return (-cls._get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) + + @classmethod + def _infer_bo_and_charges(cls, mol): + """Infer bond orders and formal charges from a molecule. + + Since most MD topology files don't explicitly retain information on bond + orders or charges, it has to be guessed from the topology. This is done by + looping over each atom and comparing its expected valence to the current + valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge (-NUE). + If two neighbouring atoms have UEs, the bond between them most + likely has to be increased by the value of the smallest NUE. + If after this process, an atom still has UEs, it needs a negative formal + charge of -NUE. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule is modified inplace and must have all hydrogens added + + Notes + ----- + This algorithm is order dependant. For example, for a carboxylate group + R-C(-O)-O the first oxygen read will receive a double bond and the other + one will be charged. It will also affect more complex conjugated systems. + """ + # heavy atoms sorted by number of heavy atom neighbors (lower first) then + # NUE (higher first) + atoms = sorted([atom for atom in mol.GetAtoms() + if atom.GetAtomicNum() > 1], + key=cls._atom_sorter) + + for atom in atoms: + # monatomic ions + if atom.GetDegree() == 0: + atom.SetFormalCharge(cls.MONATOMIC_CATION_CHARGES.get( + atom.GetAtomicNum(), + -cls._get_nb_unpaired_electrons(atom)[0])) + mol.UpdatePropertyCache(strict=False) + continue + # get NUE for each possible valence + nue = cls._get_nb_unpaired_electrons(atom) + # if there's only one possible valence state and the corresponding + # NUE is negative, it means we can only add a positive charge to + # the atom + if (len(nue) == 1) and (nue[0] < 0): atom.SetFormalCharge(-nue[0]) - atom.SetNumRadicalElectrons(0) mol.UpdatePropertyCache(strict=False) + # go to next atom if above case or atom has no unpaired electron + if (len(nue) == 1) and (nue[0] <= 0): + continue + else: + neighbors = sorted(atom.GetNeighbors(), reverse=True, + key=lambda a: cls._get_nb_unpaired_electrons(a)[0]) + # check if one of the neighbors has a common NUE + for na in neighbors: + # get NUE for the neighbor + na_nue = cls._get_nb_unpaired_electrons(na) + # smallest common NUE + common_nue = min( + min([i for i in nue if i >= 0], default=0), + min([i for i in na_nue if i >= 0], default=0) + ) + # a common NUE of 0 means we don't need to do anything + if common_nue != 0: + # increase bond order + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx()) + order = common_nue + 1 + bond.SetBondType(RDBONDORDER[order]) + mol.UpdatePropertyCache(strict=False) + # go to next atom if one of the valences is complete + nue = cls._get_nb_unpaired_electrons(atom) + if any([n == 0 for n in nue]): + break + + # if atom valence is still not filled + nue = cls._get_nb_unpaired_electrons(atom) + if not any([n == 0 for n in nue]): + # transform nue to charge + atom.SetFormalCharge(-nue[0]) + atom.SetNumRadicalElectrons(0) + mol.UpdatePropertyCache(strict=False) - -def _get_nb_unpaired_electrons(atom): - """Calculate the number of unpaired electrons (NUE) of an atom - - Parameters - ---------- - atom: rdkit.Chem.rdchem.Atom - The atom for which the NUE will be computed - - Returns - ------- - nue : list - The NUE for each possible valence of the atom - """ - expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) - current_v = atom.GetTotalValence() - atom.GetFormalCharge() - return [v - current_v for v in expected_vs] - - -def _standardize_patterns(mol, max_iter=200): - """Standardizes functional groups - - Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, - and SMARTS reactions for other functional groups. - Due to the way reactions work, we first have to split the molecule by - fragments. Then, for each fragment, we apply the standardization reactions. - Finally, the fragments are recombined. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to standardize - max_iter : int - Maximum number of iterations to standardize conjugated systems - - Returns - ------- - mol : rdkit.Chem.rdchem.Mol - The standardized molecule - - Notes - ----- - The following functional groups are transformed in this order: - - +---------------+------------------------------------------------------------------------------+ - | Name | Reaction | - +===============+==============================================================================+ - | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | - +---------------+------------------------------------------------------------------------------+ - | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | - +---------------+------------------------------------------------------------------------------+ - | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | - +---------------+------------------------------------------------------------------------------+ - | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | - +---------------+------------------------------------------------------------------------------+ - | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | - +---------------+------------------------------------------------------------------------------+ - | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | - +---------------+------------------------------------------------------------------------------+ - | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| - +---------------+------------------------------------------------------------------------------+ - | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | - +---------------+------------------------------------------------------------------------------+ - - """ - - # standardize conjugated systems - _rebuild_conjugated_bonds(mol, max_iter) - - # list of sanitized reactions - reactions = [] - for rxn in STANDARDIZATION_REACTIONS: - reaction = AllChem.ReactionFromSmarts(rxn) - reactions.append(reaction) - - # fragment mol (reactions must have single reactant and product) - fragments = list(Chem.GetMolFrags(mol, asMols=True)) - for reactant in fragments: - _apply_reactions(reactions, reactant) - - # recombine fragments - newmol = fragments.pop(0) - for fragment in fragments: - newmol = Chem.CombineMols(newmol, fragment) - - return newmol - - -def _apply_reactions(reactions, reactant): - """Applies a series of unimolecular reactions to a molecule. The reactions - must not add any atom to the product. The molecule is modified inplace. - - Parameters - ---------- - reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] - Reactions from SMARTS. Each reaction is applied until no more - transformations can be made. - reactant : rdkit.Chem.rdchem.Mol - The molecule to transform inplace - - """ - reactant.UpdatePropertyCache(strict=False) - Chem.Kekulize(reactant) - for reaction in reactions: - while reaction.RunReactantInPlace(reactant): - reactant.UpdatePropertyCache(strict=False) + @staticmethod + def _get_nb_unpaired_electrons(atom): + """Calculate the number of unpaired electrons (NUE) of an atom + + Parameters + ---------- + atom: rdkit.Chem.rdchem.Atom + The atom for which the NUE will be computed + + Returns + ------- + nue : list + The NUE for each possible valence of the atom + """ + expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) + current_v = atom.GetTotalValence() - atom.GetFormalCharge() + return [v - current_v for v in expected_vs] + + def _standardize_patterns(self, mol, max_iter=None): + """Standardizes functional groups + + Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, + and SMARTS reactions for other functional groups. + Due to the way reactions work, we first have to split the molecule by + fragments. Then, for each fragment, we apply the standardization reactions. + Finally, the fragments are recombined. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to standardize + max_iter : Optional[int] + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. Maximum number + of iterations to standardize conjugated systems + + Returns + ------- + mol : rdkit.Chem.rdchem.Mol + The standardized molecule + + Notes + ----- + The following functional groups are transformed in this order: + + +---------------+------------------------------------------------------------------------------+ + | Name | Reaction | + +===============+==============================================================================+ + | conjugated | ``[*-:1]-[*:2]=[*:3]-[*-:4]>>[*+0:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated N+ | ``[N;X3;v3:1]-[*:2]=[*:3]-[*-:4]>>[N+:1]=[*:2]-[*:3]=[*+0:4]`` | + +---------------+------------------------------------------------------------------------------+ + | conjugated O- | ``[O:1]=[#6+0,#7+:2]-[*:3]=[*:4]-[*-:5]>>[O-:1]-[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | conjug. S=O | ``[O-:1]-[S;D4;v4:2]-[*:3]=[*:4]-[*-:5]>>[O+0:1]=[*:2]=[*:3]-[*:4]=[*+0:5]`` | + +---------------+------------------------------------------------------------------------------+ + | Cterm | ``[C-;X2;H0:1]=[O:2]>>[C+0:1]=[O:2]`` | + +---------------+------------------------------------------------------------------------------+ + | Nterm | ``[N-;X2;H1;$(N-[*^3]):1]>>[N+0:1]`` | + +---------------+------------------------------------------------------------------------------+ + | keto-enolate | ``[#6-:1]-[#6:2]=[O:3]>>[#6+0:1]=[#6:2]-[O-:3]`` | + +---------------+------------------------------------------------------------------------------+ + | arginine | ``[C-;v3:1]-[#7+0;v3;H2:2]>>[#6+0:1]=[#7+:2]`` | + +---------------+------------------------------------------------------------------------------+ + | histidine | ``[#6+0;H0:1]=[#6+0:2]-[#7;X3:3]-[#6-;X3:4]>>[#6:1]=[#6:2]-[#7+:3]=[#6+0:4]``| + +---------------+------------------------------------------------------------------------------+ + | sulfone | ``[S;D4;!v6:1]-[*-:2]>>[S;v6:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + | charged N | ``[#7+0;X3:1]-[*-:2]>>[#7+:1]=[*+0:2]`` | + +---------------+------------------------------------------------------------------------------+ + + """ + if max_iter is None: + max_iter = self.max_iter + else: + warnings.warn( + "Specifying `max_iter` is deprecated and will be removed in a future " + "update. Directly create an instance of `MDAnalysisInferer` with " + "`MDAnalysisInferer(max_iter=...)` instead.", + DeprecationWarning) + + # standardize conjugated systems + self._rebuild_conjugated_bonds(mol, max_iter) + + # list of sanitized reactions + reactions = [] + for rxn in self.STANDARDIZATION_REACTIONS: + reaction = AllChem.ReactionFromSmarts(rxn) + reactions.append(reaction) + + # fragment mol (reactions must have single reactant and product) + fragments = list(Chem.GetMolFrags(mol, asMols=True)) + for reactant in fragments: + self._apply_reactions(reactions, reactant) + + # recombine fragments + newmol = fragments.pop(0) + for fragment in fragments: + newmol = Chem.CombineMols(newmol, fragment) + + return newmol + + @staticmethod + def _apply_reactions(reactions, reactant): + """Applies a series of unimolecular reactions to a molecule. The reactions + must not add any atom to the product. The molecule is modified inplace. + + Parameters + ---------- + reactions : List[rdkit.Chem.rdChemReactions.ChemicalReaction] + Reactions from SMARTS. Each reaction is applied until no more + transformations can be made. + reactant : rdkit.Chem.rdchem.Mol + The molecule to transform inplace + + """ reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) - - -def _rebuild_conjugated_bonds(mol, max_iter=200): - """Rebuild conjugated bonds without negatively charged atoms at the - beginning and end of the conjugated system - - Depending on the order in which atoms are read during the conversion, the - :func:`_infer_bo_and_charges` function might write conjugated systems with - a double bond less and both edges of the system as anions instead of the - usual alternating single and double bonds. This function corrects this - behaviour by using an iterative procedure. - The problematic molecules always follow the same pattern: - ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number - of successive single and double bonds. The goal of the iterative procedure - is to make ``n`` as small as possible by consecutively transforming - ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with - ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to - ``*=*-*=*``. - Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can - control that we don't transform the same triplet of atoms back and forth by - adding their indices to a list. - This function also handles conjugated systems with triple bonds. - The molecule needs to be kekulized first to also cover systems - with aromatic rings. - - Parameters - ---------- - mol : rdkit.Chem.rdchem.RWMol - The molecule to transform, modified inplace - max_iter : int - Maximum number of iterations performed by the function - - Notes - ----- - The molecule is modified inplace - """ - mol.UpdatePropertyCache(strict=False) - Chem.Kekulize(mol) - # pattern used to find problematic conjugated bonds - # there's usually an even number of matches for this - pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") - # pattern used to finish fixing a series of conjugated bonds - base_end_pattern = Chem.MolFromSmarts( - "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") - # used when there's an odd number of matches for `pattern` - odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," - "$([S;D4;v4]-[O-])]") - # number of unique matches with the pattern - n_matches = len(set([match[0] - for match in mol.GetSubstructMatches(pattern)])) - # nothing to standardize - if n_matches == 0: - return - # single match (unusual) - elif n_matches == 1: - # as a last resort, the only way to standardize is to find a nitrogen - # that can accept a double bond and a positive charge - # or a carbonyl that will become an enolate - end_pattern = odd_end_pattern - # at least 2 matches - else: - # give priority to base end pattern and then deal with the odd one if - # necessary - end_pattern = base_end_pattern - backtrack = [] - backtrack_cycles = 0 - for _ in range(max_iter): - # check for ending pattern - end_match = mol.GetSubstructMatch(end_pattern) - if end_match: - # index of each atom - anion1, a1, a2, anion2 = end_match - term_atom = mol.GetAtomWithIdx(anion2) - # edge-case 1: C-[O-] or [N+]-[O-] - # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] - # transform the =O to -[O-] - if ( - term_atom.GetAtomicNum() == 6 - and term_atom.GetFormalCharge() == 0 - ) or ( - term_atom.GetAtomicNum() == 7 - and term_atom.GetFormalCharge() == 1 - ): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - bond.GetBondTypeAsDouble() == 2): - bond.SetBondType(Chem.BondType.SINGLE) - neighbor.SetFormalCharge(-1) - break - # edge-case 2: S=O - # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O - # transform -[O-] to =O - elif (term_atom.GetAtomicNum() == 16 and - term_atom.GetFormalCharge() == 0): - for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - neighbor.GetFormalCharge() == -1 and - bond.GetBondTypeAsDouble() == 1): - bond.SetBondType(Chem.BondType.DOUBLE) - neighbor.SetFormalCharge(0) + for reaction in reactions: + while reaction.RunReactantInPlace(reactant): + reactant.UpdatePropertyCache(strict=False) + reactant.UpdatePropertyCache(strict=False) + Chem.Kekulize(reactant) + + def _rebuild_conjugated_bonds(self, mol, max_iter=None): + """Rebuild conjugated bonds without negatively charged atoms at the + beginning and end of the conjugated system + + Depending on the order in which atoms are read during the conversion, the + :func:`_infer_bo_and_charges` function might write conjugated systems with + a double bond less and both edges of the system as anions instead of the + usual alternating single and double bonds. This function corrects this + behaviour by using an iterative procedure. + The problematic molecules always follow the same pattern: + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number + of successive single and double bonds. The goal of the iterative procedure + is to make ``n`` as small as possible by consecutively transforming + ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with + ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to + ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can + control that we don't transform the same triplet of atoms back and forth by + adding their indices to a list. + This function also handles conjugated systems with triple bonds. + The molecule needs to be kekulized first to also cover systems + with aromatic rings. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.RWMol + The molecule to transform, modified inplace + max_iter : Optional[int] + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. Maximum number + of iterations to standardize conjugated systems + + Notes + ----- + The molecule is modified inplace + """ + max_iter = self.max_iter if max_iter is None else max_iter + mol.UpdatePropertyCache(strict=False) + Chem.Kekulize(mol) + # pattern used to find problematic conjugated bonds + # there's usually an even number of matches for this + pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") + # pattern used to finish fixing a series of conjugated bonds + base_end_pattern = Chem.MolFromSmarts( + "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + # used when there's an odd number of matches for `pattern` + odd_end_pattern = Chem.MolFromSmarts( + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," + "$([S;D4;v4]-[O-])]") + # number of unique matches with the pattern + n_matches = len(set([match[0] + for match in mol.GetSubstructMatches(pattern)])) + # nothing to standardize + if n_matches == 0: + return + # single match (unusual) + elif n_matches == 1: + # as a last resort, the only way to standardize is to find a nitrogen + # that can accept a double bond and a positive charge + # or a carbonyl that will become an enolate + end_pattern = odd_end_pattern + # at least 2 matches + else: + # give priority to base end pattern and then deal with the odd one if + # necessary + end_pattern = base_end_pattern + backtrack = [] + backtrack_cycles = 0 + for _ in range(max_iter): + # check for ending pattern + end_match = mol.GetSubstructMatch(end_pattern) + if end_match: + # index of each atom + anion1, a1, a2, anion2 = end_match + term_atom = mol.GetAtomWithIdx(anion2) + # edge-case 1: C-[O-] or [N+]-[O-] + # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] + # transform the =O to -[O-] + if ( + term_atom.GetAtomicNum() == 6 + and term_atom.GetFormalCharge() == 0 + ) or ( + term_atom.GetAtomicNum() == 7 + and term_atom.GetFormalCharge() == 1 + ): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if (neighbor.GetAtomicNum() == 8 and + bond.GetBondTypeAsDouble() == 2): + bond.SetBondType(Chem.BondType.SINGLE) + neighbor.SetFormalCharge(-1) + break + # edge-case 2: S=O + # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O + # transform -[O-] to =O + elif (term_atom.GetAtomicNum() == 16 and + term_atom.GetFormalCharge() == 0): + for neighbor in term_atom.GetNeighbors(): + bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + if (neighbor.GetAtomicNum() == 8 and + neighbor.GetFormalCharge() == -1 and + bond.GetBondTypeAsDouble() == 1): + bond.SetBondType(Chem.BondType.DOUBLE) + neighbor.SetFormalCharge(0) + break + # not an edge case: + # increment the charge + else: + term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) + # common to all cases: + # [*-]-*=*-[R] --> *=*-*=[R] + # increment the charge and switch single and double bonds + a = mol.GetAtomWithIdx(anion1) + a.SetFormalCharge(a.GetFormalCharge() + 1) + b = mol.GetBondBetweenAtoms(anion1, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + b = mol.GetBondBetweenAtoms(a2, anion2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + mol.UpdatePropertyCache(strict=False) + continue + + # switch the position of the charge and the double bond + matches = mol.GetSubstructMatches(pattern) + if matches: + # check if we haven't already transformed this triplet + for match in matches: + # store order-independent atom indices + g = set(match) + # already transformed --> try the next one + if g in backtrack: + continue + # add to backtracking and start the switch + else: + anion, a1, a2 = match + backtrack.append(g) break - # not an edge case: - # increment the charge - else: - term_atom.SetFormalCharge(term_atom.GetFormalCharge() + 1) - # common to all cases: - # [*-]-*=*-[R] --> *=*-*=[R] - # increment the charge and switch single and double bonds - a = mol.GetAtomWithIdx(anion1) - a.SetFormalCharge(a.GetFormalCharge() + 1) - b = mol.GetBondBetweenAtoms(anion1, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) - b = mol.GetBondBetweenAtoms(a2, anion2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - mol.UpdatePropertyCache(strict=False) - continue - - # switch the position of the charge and the double bond - matches = mol.GetSubstructMatches(pattern) - if matches: - # check if we haven't already transformed this triplet - for match in matches: - # store order-independent atom indices - g = set(match) - # already transformed --> try the next one - if g in backtrack: - continue - # add to backtracking and start the switch + # already performed all changes else: + if backtrack_cycles == 1: + # might be stuck because there were 2 "odd" end patterns + # misqualified as a single base one + end_pattern = odd_end_pattern + elif backtrack_cycles > 1: + # likely stuck changing the same bonds over and over so + # let's stop here + mol.UpdatePropertyCache(strict=False) + return + match = matches[0] anion, a1, a2 = match - backtrack.append(g) - break - # already performed all changes - else: - if backtrack_cycles == 1: - # might be stuck because there were 2 "odd" end patterns - # misqualified as a single base one + backtrack = [set(match)] + backtrack_cycles += 1 + + # switch charges + a = mol.GetAtomWithIdx(anion) + a.SetFormalCharge(a.GetFormalCharge() + 1) + a = mol.GetAtomWithIdx(a2) + a.SetFormalCharge(a.GetFormalCharge() - 1) + # switch bond orders + b = mol.GetBondBetweenAtoms(anion, a1) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) + b = mol.GetBondBetweenAtoms(a1, a2) + b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + mol.UpdatePropertyCache(strict=False) + # update number of matches for the end pattern + n_matches = len(set([match[0] for match in matches])) + if n_matches == 1: end_pattern = odd_end_pattern - elif backtrack_cycles > 1: - # likely stuck changing the same bonds over and over so - # let's stop here - mol.UpdatePropertyCache(strict=False) - return - match = matches[0] - anion, a1, a2 = match - backtrack = [set(match)] - backtrack_cycles += 1 - - # switch charges - a = mol.GetAtomWithIdx(anion) - a.SetFormalCharge(a.GetFormalCharge() + 1) - a = mol.GetAtomWithIdx(a2) - a.SetFormalCharge(a.GetFormalCharge() - 1) - # switch bond orders - b = mol.GetBondBetweenAtoms(anion, a1) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() + 1]) - b = mol.GetBondBetweenAtoms(a1, a2) - b.SetBondType(RDBONDORDER[b.GetBondTypeAsDouble() - 1]) + # start new iteration + continue + + # no more changes to apply mol.UpdatePropertyCache(strict=False) - # update number of matches for the end pattern - n_matches = len(set([match[0] for match in matches])) - if n_matches == 1: - end_pattern = odd_end_pattern - # start new iteration - continue - - # no more changes to apply - mol.UpdatePropertyCache(strict=False) - return + return - # reached max_iter - warnings.warn("The standardization could not be completed within a " - "reasonable number of iterations") + # reached max_iter + warnings.warn("The standardization could not be completed within a " + "reasonable number of iterations") diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 6139f962f9e..02989b7b6f5 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -39,8 +39,7 @@ _infer_bo_and_charges, _set_atom_property, _standardize_patterns) - from MDAnalysis.converters.RDKitInferring import ( - _atom_sorter, _rebuild_conjugated_bonds) + from MDAnalysis.converters.RDKitInferring import MDAnalysisInferer from rdkit import Chem from rdkit.Chem import AllChem @@ -72,7 +71,7 @@ def mol2_mol(): def smiles_mol(): mol = Chem.MolFromSmiles("CN1C=NC2=C1C(=O)N(C(=O)N2C)C") mol = Chem.AddHs(mol) - cids = AllChem.EmbedMultipleConfs(mol, numConfs=3) + AllChem.EmbedMultipleConfs(mol, numConfs=3) return mol @staticmethod @@ -172,7 +171,7 @@ def test_no_atoms_attr(self): def test_single_atom_mol(self, smi): u = mda.Universe.from_smiles(smi, addHs=False, generate_coordinates=False) - mol = u.atoms.convert_to.rdkit(NoImplicit=False) + mol = u.atoms.convert_to.rdkit(inferer=None) assert mol.GetNumAtoms() == 1 assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") @@ -195,7 +194,7 @@ def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) mda_atom = sel.atoms[atom_index] - umol = sel.convert_to.rdkit(NoImplicit=False) + umol = sel.convert_to.rdkit(inferer=None) atom = umol.GetAtomWithIdx(atom_index) mi = atom.GetMonomerInfo() assert mda_atom.altLoc == mi.GetAltLoc() @@ -245,7 +244,7 @@ def test_warn_guess_bonds(self): def test_bonds_outside_sel(self): u = mda.Universe(Chem.MolFromSmiles("CC(=O)C")) ag = u.select_atoms("index 1") - ag.convert_to.rdkit(NoImplicit=False) + ag.convert_to.rdkit(inferer=None) def test_error_no_hydrogen(self, uo2): with pytest.raises(AttributeError, @@ -256,12 +255,12 @@ def test_error_no_hydrogen(self, uo2): def test_error_no_hydrogen_implicit(self, uo2): with warnings.catch_warnings(): warnings.simplefilter("error") - uo2.atoms.convert_to.rdkit(NoImplicit=False) + uo2.atoms.convert_to.rdkit(inferer=None) def test_warning_no_hydrogen_force(self, uo2): with pytest.warns(UserWarning, match="Forcing to continue the conversion"): - uo2.atoms.convert_to.rdkit(NoImplicit=False, force=True) + uo2.atoms.convert_to.rdkit(inferer=None, force=True) @pytest.mark.parametrize("attr, value, expected", [ ("names", "N", " N "), @@ -302,13 +301,13 @@ def test_other_attributes(self, mol2, idx): ]) def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) - mol = ag.convert_to.rdkit(NoImplicit=False) + mol = ag.convert_to.rdkit(inferer=None) expected = [i for i in range(len(ag))] indices = [a.GetIntProp("_MDAnalysis_index") for a in mol.GetAtoms()] assert_equal(indices, expected) def test_assign_coordinates(self, pdb): - mol = pdb.atoms.convert_to.rdkit(NoImplicit=False) + mol = pdb.atoms.convert_to.rdkit(inferer=None) positions = mol.GetConformer().GetPositions() assert_allclose(positions, pdb.atoms.positions) @@ -393,11 +392,10 @@ def test_sanitize_fail_warning(self): mol.UpdatePropertyCache(strict=False) mol = Chem.AddHs(mol) u = mda.Universe(mol) - with pytest.warns(UserWarning, match="Could not sanitize molecule"): - u.atoms.convert_to.rdkit(NoImplicit=False) - with warnings.catch_warnings(): - warnings.filterwarnings("error", "Could not sanitize molecule") - u.atoms.convert_to.rdkit() + with pytest.warns() as record: + u.atoms.convert_to.rdkit(inferer=None) + assert "Could not sanitize molecule" not in "\n".join( + [str(r.message) for r in record]) @requires_rdkit @@ -656,9 +654,20 @@ def test_order_independant_issue_3339(self, smi): def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) + inferer = MDAnalysisInferer(max_iter=2) with pytest.warns(UserWarning, match="reasonable number of iterations"): - _rebuild_conjugated_bonds(mol, 2) + inferer._rebuild_conjugated_bonds(mol) + + def test_deprecation_warning_max_iter(self): + smi = "c1ccccc1" + mol = Chem.MolFromSmiles(smi) + inferer = MDAnalysisInferer() + with pytest.warns( + DeprecationWarning, + match="Specifying `max_iter` is deprecated and will be removed" + ): + inferer._standardize_patterns(mol, max_iter=1) @pytest.mark.parametrize("smi", [ "[Li+]", "[Na+]", "[K+]", "[Rb+]", "[Ag+]", "[Cs+]", @@ -718,6 +727,7 @@ def test_atom_sorter(self): # atom indices: 1 3 5 6 mol.UpdatePropertyCache() sorted_atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], key=_atom_sorter) + if atom.GetAtomicNum() > 1], + key=MDAnalysisInferer._atom_sorter) sorted_indices = [atom.GetIdx() for atom in sorted_atoms] assert sorted_indices == [6, 5, 1, 3] From 77dfd3a4383d8bd6d7e30a5b508ceb7896315784 Mon Sep 17 00:00:00 2001 From: Cedric Bouysset Date: Fri, 29 Sep 2023 16:40:31 +0100 Subject: [PATCH 04/14] add template based inferring --- package/MDAnalysis/converters/RDKitInferring.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index e51922e2766..ed56c55a81f 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -33,7 +33,7 @@ """ import warnings from dataclasses import dataclass -from typing import ClassVar, Dict, List +from typing import ClassVar, Dict, List, Any import numpy as np @@ -527,3 +527,18 @@ def _rebuild_conjugated_bonds(self, mol, max_iter=None): # reached max_iter warnings.warn("The standardization could not be completed within a " "reasonable number of iterations") + + +@dataclass(frozen=True) +class TemplateInferer: + """Infer bond orders and charges by matching the molecule with a template molecule containing bond orders and charges. + + Attributes + ---------- + template : Chem.Mol + Molecule containing the bond orders and charges. + """ + template: Any + + def __call__(self, mol): + return AllChem.AssignBondOrdersFromTemplate(self.template, mol) From f4eee15827864bc13a7ea8b8528801d851d09e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 14 Dec 2023 21:15:42 +0100 Subject: [PATCH 05/14] handle deprecate NoImplicit --- package/MDAnalysis/converters/RDKit.py | 90 +++++++++++++++++--------- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 119ce055c92..3ad735cb7f8 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -84,25 +84,23 @@ import copy import warnings +from contextlib import suppress from functools import lru_cache from io import StringIO import numpy as np -from . import base from ..coordinates import memory from ..coordinates.PDB import PDBWriter from ..core.topologyattrs import _TOPOLOGY_ATTRS from ..exceptions import NoDataError +from . import base -try: +with suppress(ImportError): from rdkit import Chem - from .RDKitInferring import ( - RDBONDORDER, MDAnalysisInferer) -except ImportError: - pass -else: + from .RDKitInferring import RDBONDORDER, MDAnalysisInferer + RDATTRIBUTES = { "altLocs": "AltLoc", "chainIDs": "ChainId", @@ -249,8 +247,9 @@ class RDKitConverter(base.ConverterBase): guessed if not present. Hydrogens should be explicit in the topology file. If this is not the case, - use the parameter ``NoImplicit=False`` when using the converter to allow - implicit hydrogens and disable inferring bond orders and charges. + use the parameter ``implicit_hydrogens=True`` when using the converter to allow + implicit hydrogens, and ``inferer=None`` to disable inferring bond orders and + charges. Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every @@ -259,7 +258,7 @@ class RDKitConverter(base.ConverterBase): sensitive to the arguments that were passed to the converter. The number of objects cached can be changed with the function :func:`set_converter_cache_size`. However, ``ag.convert_to("RDKIT")`` - followed by ``ag.convert_to("RDKIT", NoImplicit=False)`` will not use the + followed by ``ag.convert_to("RDKIT", implicit_hydrogens=False)`` will not use the cache since the arguments given are different. You can pass a ``cache=False`` argument to the converter to bypass the caching system. @@ -292,8 +291,8 @@ class RDKitConverter(base.ConverterBase): lib = 'RDKIT' units = {'time': None, 'length': 'Angstrom'} - def convert(self, obj, cache=True, NoImplicit=True, max_iter=None, - force=False, inferer=DEFAULT_INFERER): + def convert(self, obj, cache=True, implicit_hydrogens=False, + force=False, inferer=DEFAULT_INFERER, **kwargs): """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. @@ -309,14 +308,18 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=None, inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] A callable to infer bond orders and charges for the RDKit molecule created by the converter. If ``None``, inferring is skipped. - NoImplicit : bool - Prevent adding hydrogens to the molecule. - max_iter : int - Deprecated, use `MDAnalysisInferer(max_iter=...)` instead. Maximum number - of iterations to standardize conjugated systems. + implicit_hydrogens : bool + Whether to allow implicit hydrogens on the molecule or not. force : bool Force the conversion when no hydrogens were detected but - ``NoImplicit=True``. Useful for inorganic molecules mostly. + ``inferer`` is not ``None``. Useful for inorganic molecules mostly. + + .. versionchanged:: 2.7.0 + Deprecated ``max_iter`` (moved to the inferer class + :class:`~MDAnalysis.converters.RDKitInferring.MDAnalysisInferer`) and + deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added + ``inferer`` to specify a callable that can transform the molecule (this + operation is cached). """ try: @@ -333,19 +336,32 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=None, "please use a valid AtomGroup or Universe".format( type(obj))) from None - if max_iter is not None: + if (max_iter := kwargs.get("max_iter")) is not None: warnings.warn( "Using `max_iter` is deprecated, use `MDAnalysisInferer(max_iter=...)` " "instead", DeprecationWarning) if isinstance(inferer, MDAnalysisInferer): inferer = MDAnalysisInferer(max_iter=max_iter) + + if (NoImplicit := kwargs.get("NoImplicit")) is not None: + warnings.warn( + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " + "To disable bond order and formal charge inferring, use " + "`inferer=None`", DeprecationWarning) + implicit_hydrogens = not NoImplicit + # backwards compatibility + if implicit_hydrogens: + inferer = None + # parameters passed to atomgroup_to_mol - kwargs = dict(NoImplicit=NoImplicit, force=force, inferer=inferer) + params = dict( + implicit_hydrogens=implicit_hydrogens, force=force, inferer=inferer + ) if cache: - mol = atomgroup_to_mol(ag, **kwargs) + mol = atomgroup_to_mol(ag, **params) mol = copy.deepcopy(mol) else: - mol = atomgroup_to_mol.__wrapped__(ag, **kwargs) + mol = atomgroup_to_mol.__wrapped__(ag, **params) # add a conformer for the current Timestep if hasattr(ag, "positions"): @@ -368,20 +384,21 @@ def convert(self, obj, cache=True, NoImplicit=True, max_iter=None, @lru_cache(maxsize=2) -def atomgroup_to_mol(ag, NoImplicit=True, force=False, inferer=DEFAULT_INFERER): +def atomgroup_to_mol( + ag, implicit_hydrogens=False, force=False, inferer=DEFAULT_INFERER, **kwargs +): """Converts an AtomGroup to an RDKit molecule without coordinates. Parameters ----------- ag : MDAnalysis.core.groups.AtomGroup The AtomGroup to convert - NoImplicit : bool - Prevent adding hydrogens to the molecule and allow bond orders and - formal charges to be guessed from the valence of each atom. + implicit_hydrogens : bool + Whether to allow implicit hydrogens on the molecule or not. force : bool - Force the conversion when no hydrogens were detected but - ``NoImplicit=True``. Mostly useful for inorganic molecules. - inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] + Force the conversion when no hydrogens were detected but ``inferer`` is not + ``None``. Useful for inorganic molecules mostly. + inferer : Optional[Callable[[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol]] A callable to infer bond orders and charges for the RDKit molecule created by the converter. If ``None``, inferring is skipped. """ @@ -408,6 +425,18 @@ def atomgroup_to_mol(ag, NoImplicit=True, force=False, inferer=DEFAULT_INFERER): "to disable inferring bond orders and charges. You can also use " "``force=True`` to ignore this error.") + if (NoImplicit := kwargs.pop("NoImplicit", None)) is not None: + warnings.warn( + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " + "To disable bond order and formal charge inferring, use " + "`inferer=None`", DeprecationWarning) + implicit_hydrogens = not NoImplicit + # backwards compatibility + if implicit_hydrogens: + inferer = None + if kwargs: + raise ValueError(f"Found unexpected arguments: {kwargs}.") + # attributes accepted in PDBResidueInfo object pdb_attrs = {} for attr in RDATTRIBUTES.keys(): @@ -427,6 +456,7 @@ def get_resname(idx): other_attrs[attr] = getattr(ag, attr) mol = Chem.RWMol() + NoImplicit = not implicit_hydrogens # map index in universe to index in mol atom_mapper = {} @@ -468,7 +498,7 @@ def get_resname(idx): mol.UpdatePropertyCache(strict=False) - if inferer is not None and NoImplicit: + if inferer is not None: # infer bond orders and formal charges mol = inferer(mol) From 6eb1ac65ab1e46233f0a21f34f2580246c575f2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 14 Dec 2023 21:16:19 +0100 Subject: [PATCH 06/14] fix tests and add missing TemplateInferer tests --- .../MDAnalysisTests/converters/test_rdkit.py | 89 +++++++++++++++---- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 02989b7b6f5..61bd194e158 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -23,6 +23,7 @@ import copy import warnings +from contextlib import suppress from io import StringIO import MDAnalysis as mda @@ -33,18 +34,15 @@ from MDAnalysisTests.util import import_not_available from numpy.testing import assert_allclose, assert_equal -try: +with suppress(ImportError): from MDAnalysis.converters.RDKit import (RDATTRIBUTES, + atomgroup_to_mol, _add_mda_attr_to_rdkit, - _infer_bo_and_charges, - _set_atom_property, - _standardize_patterns) - from MDAnalysis.converters.RDKitInferring import MDAnalysisInferer + _set_atom_property) + from MDAnalysis.converters.RDKitInferring import MDAnalysisInferer, TemplateInferer from rdkit import Chem from rdkit.Chem import AllChem -except ImportError: - pass requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), @@ -252,7 +250,7 @@ def test_error_no_hydrogen(self, uo2): "explicit"): uo2.atoms.convert_to("RDKIT") - def test_error_no_hydrogen_implicit(self, uo2): + def test_error_no_hydrogen_skip_inferring(self, uo2): with warnings.catch_warnings(): warnings.simplefilter("error") uo2.atoms.convert_to.rdkit(inferer=None) @@ -260,7 +258,7 @@ def test_error_no_hydrogen_implicit(self, uo2): def test_warning_no_hydrogen_force(self, uo2): with pytest.warns(UserWarning, match="Forcing to continue the conversion"): - uo2.atoms.convert_to.rdkit(inferer=None, force=True) + uo2.atoms.convert_to.rdkit(force=True) @pytest.mark.parametrize("attr, value, expected", [ ("names", "N", " N "), @@ -379,7 +377,7 @@ def test_cache(self): assert cached_func.cache_info().maxsize == 3 # test (4): caching is sensitive to converter arguments previous_cache = cached_func.cache_info() - uc.atoms.convert_to.rdkit(NoImplicit=False) + uc.atoms.convert_to.rdkit(implicit_hydrogens=True) cache = cached_func.cache_info() assert cache.misses == previous_cache.misses + 1 # test (5): skip cache @@ -388,18 +386,46 @@ def test_cache(self): assert cache == new_cache def test_sanitize_fail_warning(self): - mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", sanitize=False) - mol.UpdatePropertyCache(strict=False) - mol = Chem.AddHs(mol) + params = Chem.SmilesParserParams() + params.removeHs = False + params.sanitize = False + mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", params) u = mda.Universe(mol) with pytest.warns() as record: u.atoms.convert_to.rdkit(inferer=None) assert "Could not sanitize molecule" not in "\n".join( [str(r.message) for r in record]) + def test_deprecation_maw_iter(self, mol2): + with pytest.warns(DeprecationWarning, match="Using `max_iter` is deprecated"): + mol2.atoms.convert_to.rdkit(max_iter=2) + + def test_deprecation_NoImplicit(self, mol2): + with pytest.warns(DeprecationWarning, match="Using `NoImplicit` is deprecated"): + mol2.atoms.convert_to.rdkit(NoImplicit=True) + + def test_deprecation_atomgroup_to_mol_NoImplicit(self, mol2): + with pytest.warns(DeprecationWarning, match="Using `NoImplicit` is deprecated"): + atomgroup_to_mol(mol2.atoms, NoImplicit=False) + + def test_atomgroup_to_mol_unexpected_kwargs(self, mol2): + with pytest.raises( + ValueError, match="Found unexpected arguments: {'foo': 'bar'}" + ): + atomgroup_to_mol(mol2.atoms, foo="bar") + + def test_custom_callable_inferer(self, mol2): + def potatoe(mol): + new = Chem.Mol(mol) + new.SetProp("_Name", "🥔") + return new + + mol = mol2.atoms.convert_to.rdkit(inferer=potatoe) + assert mol.GetProp("_Name") == "🥔" + @requires_rdkit -class TestRDKitFunctions(object): +class TestRDKitMDAnalysisInferer(object): def add_Hs_remove_bo_and_charges(self, mol): """Add hydrogens and remove bond orders and charges from a molecule""" @@ -427,8 +453,9 @@ def enumerate_reordered_mol(self, mol): def assign_bond_orders_and_charges(self, mol): """Returns a sanitized molecule with infered bond orders and charges""" - _infer_bo_and_charges(mol) - mol = _standardize_patterns(mol) + inferer = MDAnalysisInferer() + inferer._infer_bo_and_charges(mol) + mol = inferer._standardize_patterns(mol) Chem.SanitizeMol(mol) return mol @@ -461,7 +488,8 @@ def assert_isomorphic_resonance_structure(self, mol, ref): def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) + inferer = MDAnalysisInferer() + inferer._infer_bo_and_charges(mol) Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) @@ -476,7 +504,8 @@ def test_infer_bond_orders(self, smi, out): def test_infer_charges(self, smi, atom_idx, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - _infer_bo_and_charges(mol) + inferer = MDAnalysisInferer() + inferer._infer_bo_and_charges(mol) Chem.SanitizeMol(mol) assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge @@ -731,3 +760,27 @@ def test_atom_sorter(self): key=MDAnalysisInferer._atom_sorter) sorted_indices = [atom.GetIdx() for atom in sorted_atoms] assert sorted_indices == [6, 5, 1, 3] + + +class TestRDKitTemplateInferer: + @pytest.fixture(scope="function") + def template(self): + params = Chem.SmilesParserParams() + params.removeHs = False + return Chem.MolFromSmiles("[H]-[N+](-[H])(-[H])-[H]", params) + + def test_template_inferring(self, template): + params = Chem.SmilesParserParams() + params.removeHs = False + params.sanitize = False + mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", params) + assert mol.GetAtomWithIdx(1).GetFormalCharge() == 0 + + inferer = TemplateInferer(template=template) + mol = inferer(mol) + assert mol.GetAtomWithIdx(1).GetFormalCharge() == 1 + + def test_from_convert_to_method(self, template): + u = mda.Universe(template) + mol = u.atoms.convert_to.rdkit(inferer=TemplateInferer(template=template)) + assert mol.GetAtomWithIdx(1).GetFormalCharge() == 1 From 81e5e8185c3b52b9e444bde2b3263bba56db1f41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Thu, 14 Dec 2023 22:03:07 +0100 Subject: [PATCH 07/14] formatting and typing --- package/MDAnalysis/converters/RDKit.py | 82 ++- .../MDAnalysis/converters/RDKitInferring.py | 174 +++--- .../MDAnalysisTests/converters/test_rdkit.py | 557 ++++++++++-------- 3 files changed, 460 insertions(+), 353 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 3ad735cb7f8..1962a3cc897 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -96,6 +96,7 @@ from ..exceptions import NoDataError from . import base +DEFAULT_INFERER = None with suppress(ImportError): from rdkit import Chem @@ -125,10 +126,11 @@ class RDKitReader(memory.MemoryReader): .. versionadded:: 2.0.0 """ - format = 'RDKIT' + + format = "RDKIT" # Structure.coordinates always in Angstrom - units = {'time': None, 'length': 'Angstrom'} + units = {"time": None, "length": "Angstrom"} @staticmethod def _format_hint(thing): @@ -152,14 +154,14 @@ def __init__(self, filename, **kwargs): RDKit molecule """ n_atoms = filename.GetNumAtoms() - coordinates = np.array([ - conf.GetPositions() for conf in filename.GetConformers()], - dtype=np.float32) + coordinates = np.array( + [conf.GetPositions() for conf in filename.GetConformers()], dtype=np.float32 + ) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") coordinates = np.empty((1, n_atoms, 3), dtype=np.float32) coordinates[:] = np.nan - super(RDKitReader, self).__init__(coordinates, order='fac', **kwargs) + super(RDKitReader, self).__init__(coordinates, order="fac", **kwargs) class RDKitConverter(base.ConverterBase): @@ -288,11 +290,18 @@ class RDKitConverter(base.ConverterBase): """ - lib = 'RDKIT' - units = {'time': None, 'length': 'Angstrom'} - - def convert(self, obj, cache=True, implicit_hydrogens=False, - force=False, inferer=DEFAULT_INFERER, **kwargs): + lib = "RDKIT" + units = {"time": None, "length": "Angstrom"} + + def convert( + self, + obj, + cache=True, + implicit_hydrogens=False, + force=False, + inferer=DEFAULT_INFERER, + **kwargs, + ): """Write selection at current trajectory frame to :class:`~rdkit.Chem.rdchem.Mol`. @@ -325,21 +334,26 @@ def convert(self, obj, cache=True, implicit_hydrogens=False, try: from rdkit import Chem except ImportError: - raise ImportError("RDKit is required for the RDKitConverter but " - "it's not installed. Try installing it with \n" - "conda install -c conda-forge rdkit") + raise ImportError( + "RDKit is required for the RDKitConverter but " + "it's not installed. Try installing it with \n" + "conda install -c conda-forge rdkit" + ) try: # make sure to use atoms (Issue 46) ag = obj.atoms except AttributeError: - raise TypeError("No `atoms` attribute in object of type {}, " - "please use a valid AtomGroup or Universe".format( - type(obj))) from None + raise TypeError( + "No `atoms` attribute in object of type {}, " + "please use a valid AtomGroup or Universe".format(type(obj)) + ) from None if (max_iter := kwargs.get("max_iter")) is not None: warnings.warn( "Using `max_iter` is deprecated, use `MDAnalysisInferer(max_iter=...)` " - "instead", DeprecationWarning) + "instead", + DeprecationWarning, + ) if isinstance(inferer, MDAnalysisInferer): inferer = MDAnalysisInferer(max_iter=max_iter) @@ -347,7 +361,9 @@ def convert(self, obj, cache=True, implicit_hydrogens=False, warnings.warn( "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " "To disable bond order and formal charge inferring, use " - "`inferer=None`", DeprecationWarning) + "`inferer=None`", + DeprecationWarning, + ) implicit_hydrogens = not NoImplicit # backwards compatibility if implicit_hydrogens: @@ -366,8 +382,10 @@ def convert(self, obj, cache=True, implicit_hydrogens=False, # add a conformer for the current Timestep if hasattr(ag, "positions"): if np.isnan(ag.positions).any(): - warnings.warn("NaN detected in coordinates, the output " - "molecule will not have 3D coordinates assigned") + warnings.warn( + "NaN detected in coordinates, the output " + "molecule will not have 3D coordinates assigned" + ) else: # assign coordinates conf = Chem.Conformer(mol.GetNumAtoms()) @@ -409,7 +427,8 @@ def atomgroup_to_mol( "The `elements` attribute is required for the RDKitConverter " "but is not present in this AtomGroup. Please refer to the " "documentation to guess elements from other attributes or " - "type `help(MDAnalysis.topology.guessers)`") from None + "type `help(MDAnalysis.topology.guessers)`" + ) from None if "H" not in ag.elements: if force: @@ -423,13 +442,16 @@ def atomgroup_to_mol( "converter requires all hydrogens to be explicit. You can use " "the parameter ``inferer=None`` when using the converter " "to disable inferring bond orders and charges. You can also use " - "``force=True`` to ignore this error.") + "``force=True`` to ignore this error." + ) if (NoImplicit := kwargs.pop("NoImplicit", None)) is not None: warnings.warn( - "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " - "To disable bond order and formal charge inferring, use " - "`inferer=None`", DeprecationWarning) + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " + "To disable bond order and formal charge inferring, use " + "`inferer=None`", + DeprecationWarning, + ) implicit_hydrogens = not NoImplicit # backwards compatibility if implicit_hydrogens: @@ -444,9 +466,12 @@ def atomgroup_to_mol( pdb_attrs[attr] = getattr(ag, attr) resnames = pdb_attrs.get("resnames", None) if resnames is None: + def get_resname(idx): return "" + else: + def get_resname(idx): return resnames[idx] @@ -485,7 +510,8 @@ def get_resname(idx): except NoDataError: warnings.warn( "No `bonds` attribute in this AtomGroup. Guessing bonds based " - "on atoms coordinates") + "on atoms coordinates" + ) ag.guess_bonds() for bond in ag.bonds: @@ -515,7 +541,7 @@ def set_converter_cache_size(maxsize): conversions in memory. Using ``maxsize=None`` will remove all limits to the cache size, i.e. everything is cached. """ - global atomgroup_to_mol # pylint: disable=global-statement + global atomgroup_to_mol # pylint: disable=global-statement atomgroup_to_mol = lru_cache(maxsize=maxsize)(atomgroup_to_mol.__wrapped__) diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index ed56c55a81f..81204e4d9d6 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -32,17 +32,17 @@ :members: """ import warnings +from contextlib import suppress from dataclasses import dataclass -from typing import ClassVar, Dict, List, Any +from typing import ClassVar, Dict, List, Optional, Tuple import numpy as np -try: +with suppress(ImportError): from rdkit import Chem - from rdkit.Chem import AllChem -except ImportError: - pass -else: + from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate + from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts + RDBONDORDER = { 1: Chem.BondType.SINGLE, 1.5: Chem.BondType.AROMATIC, @@ -55,7 +55,7 @@ PERIODIC_TABLE = Chem.GetPeriodicTable() -@dataclass(kw_only=True, frozen=True) +@dataclass(frozen=True) class MDAnalysisInferer: """Bond order and formal charge inferring as originally implemented for the RDKit converter. @@ -64,20 +64,32 @@ class MDAnalysisInferer: ---------- max_iter : int Maximum number of iterations to standardize conjugated systems. - See :meth:`_rebuild_conjugated_bonds` + See :meth:`~MDAnalysisInferer._rebuild_conjugated_bonds` MONATOMIC_CATION_CHARGES : ClassVar[Dict[int, int]] Charges that should be assigned to monatomic cations. Maps atomic number to their formal charge. Anion charges are directly handled by the code using the typical valence of the atom. STANDARDIZATION_REACTIONS : ClassVar[List[str]] - Reactions uses by :meth:`_standardize_patterns` to fix challenging cases must - have single reactant and product, and cannot add any atom + Reactions uses by :meth:`~MDAnalysisInferer._standardize_patterns` to fix + challenging cases must have single reactant and product, and cannot add any + atom. .. versionadded:: 2.7.0 """ + MONATOMIC_CATION_CHARGES: ClassVar[Dict[int, int]] = { - 3: 1, 11: 1, 19: 1, 37: 1, 47: 1, 55: 1, - 12: 2, 20: 2, 29: 2, 30: 2, 38: 2, 56: 2, + 3: 1, + 11: 1, + 19: 1, + 37: 1, + 47: 1, + 55: 1, + 12: 2, + 20: 2, + 29: 2, + 30: 2, + 38: 2, + 56: 2, 26: 2, # Fe could also be +3 13: 3, } @@ -94,7 +106,7 @@ class MDAnalysisInferer: max_iter: int = 200 - def __call__(self, mol): + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": """Infer bond orders and formal charges in the molecule. Parameters @@ -113,38 +125,38 @@ def __call__(self, mol): # sanitize if possible err = Chem.SanitizeMol(mol, catchErrors=True) if err: - warnings.warn("Could not sanitize molecule: " - f"failed during step {err!r}") + warnings.warn(f"Could not sanitize molecule: failed during step {err!r}") return mol - def _reorder_atoms(self, mol): + def _reorder_atoms(self, mol: "Chem.Mol") -> "Chem.Mol": """Reorder atoms to match MDAnalysis, since the reactions from :meth:`_standardize_patterns` will mess up the original order. """ - try: - order = np.argsort([atom.GetIntProp("_MDAnalysis_index") - for atom in mol.GetAtoms()]) - except KeyError: - # not a molecule converted by MDAnalysis - pass - else: - mol = Chem.RenumberAtoms(mol, order.astype(int).tolist()) + with suppress(KeyError): + order = np.argsort( + [atom.GetIntProp("_MDAnalysis_index") for atom in mol.GetAtoms()] + ) + return Chem.RenumberAtoms(mol, order.astype(int).tolist()) + # not a molecule converted by MDAnalysis return mol @classmethod - def _atom_sorter(cls, atom): + def _atom_sorter(cls, atom: "Chem.Atom") -> Tuple[int, int]: """Sorts atoms in the molecule in a way that makes it easy for the bond order and charge infering code to get the correct state on the first try. Currently sorts by number of unpaired electrons, then by number of heavy atom neighbors (i.e. atoms at the edge first).""" - num_heavy_neighbors = len([ - neighbor for neighbor in atom.GetNeighbors() - if neighbor.GetAtomicNum() > 1] + num_heavy_neighbors = len( + [ + neighbor + for neighbor in atom.GetNeighbors() + if neighbor.GetAtomicNum() > 1 + ] ) return (-cls._get_nb_unpaired_electrons(atom)[0], num_heavy_neighbors) @classmethod - def _infer_bo_and_charges(cls, mol): + def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: """Infer bond orders and formal charges from a molecule. Since most MD topology files don't explicitly retain information on bond @@ -170,16 +182,19 @@ def _infer_bo_and_charges(cls, mol): """ # heavy atoms sorted by number of heavy atom neighbors (lower first) then # NUE (higher first) - atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], - key=cls._atom_sorter) + atoms = sorted( + [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1], + key=cls._atom_sorter, + ) for atom in atoms: # monatomic ions if atom.GetDegree() == 0: - atom.SetFormalCharge(cls.MONATOMIC_CATION_CHARGES.get( - atom.GetAtomicNum(), - -cls._get_nb_unpaired_electrons(atom)[0])) + atom.SetFormalCharge( + cls.MONATOMIC_CATION_CHARGES.get( + atom.GetAtomicNum(), -cls._get_nb_unpaired_electrons(atom)[0] + ) + ) mol.UpdatePropertyCache(strict=False) continue # get NUE for each possible valence @@ -194,8 +209,11 @@ def _infer_bo_and_charges(cls, mol): if (len(nue) == 1) and (nue[0] <= 0): continue else: - neighbors = sorted(atom.GetNeighbors(), reverse=True, - key=lambda a: cls._get_nb_unpaired_electrons(a)[0]) + neighbors = sorted( + atom.GetNeighbors(), + reverse=True, + key=lambda a: cls._get_nb_unpaired_electrons(a)[0], + ) # check if one of the neighbors has a common NUE for na in neighbors: # get NUE for the neighbor @@ -203,13 +221,12 @@ def _infer_bo_and_charges(cls, mol): # smallest common NUE common_nue = min( min([i for i in nue if i >= 0], default=0), - min([i for i in na_nue if i >= 0], default=0) + min([i for i in na_nue if i >= 0], default=0), ) # a common NUE of 0 means we don't need to do anything if common_nue != 0: # increase bond order - bond = mol.GetBondBetweenAtoms( - atom.GetIdx(), na.GetIdx()) + bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx()) order = common_nue + 1 bond.SetBondType(RDBONDORDER[order]) mol.UpdatePropertyCache(strict=False) @@ -227,7 +244,7 @@ def _infer_bo_and_charges(cls, mol): mol.UpdatePropertyCache(strict=False) @staticmethod - def _get_nb_unpaired_electrons(atom): + def _get_nb_unpaired_electrons(atom: "Chem.Atom") -> List[int]: """Calculate the number of unpaired electrons (NUE) of an atom Parameters @@ -237,14 +254,16 @@ def _get_nb_unpaired_electrons(atom): Returns ------- - nue : list + nue : List[int] The NUE for each possible valence of the atom """ expected_vs = PERIODIC_TABLE.GetValenceList(atom.GetAtomicNum()) current_v = atom.GetTotalValence() - atom.GetFormalCharge() return [v - current_v for v in expected_vs] - def _standardize_patterns(self, mol, max_iter=None): + def _standardize_patterns( + self, mol: "Chem.Mol", max_iter: Optional[int] = None + ) -> "Chem.Mol": """Standardizes functional groups Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, @@ -304,16 +323,14 @@ def _standardize_patterns(self, mol, max_iter=None): "Specifying `max_iter` is deprecated and will be removed in a future " "update. Directly create an instance of `MDAnalysisInferer` with " "`MDAnalysisInferer(max_iter=...)` instead.", - DeprecationWarning) + DeprecationWarning, + ) # standardize conjugated systems self._rebuild_conjugated_bonds(mol, max_iter) # list of sanitized reactions - reactions = [] - for rxn in self.STANDARDIZATION_REACTIONS: - reaction = AllChem.ReactionFromSmarts(rxn) - reactions.append(reaction) + reactions = [ReactionFromSmarts(rxn) for rxn in self.STANDARDIZATION_REACTIONS] # fragment mol (reactions must have single reactant and product) fragments = list(Chem.GetMolFrags(mol, asMols=True)) @@ -328,7 +345,9 @@ def _standardize_patterns(self, mol, max_iter=None): return newmol @staticmethod - def _apply_reactions(reactions, reactant): + def _apply_reactions( + reactions: List["ChemicalReaction"], reactant: "Chem.Mol" + ) -> None: """Applies a series of unimolecular reactions to a molecule. The reactions must not add any atom to the product. The molecule is modified inplace. @@ -349,7 +368,9 @@ def _apply_reactions(reactions, reactant): reactant.UpdatePropertyCache(strict=False) Chem.Kekulize(reactant) - def _rebuild_conjugated_bonds(self, mol, max_iter=None): + def _rebuild_conjugated_bonds( + self, mol: "Chem.Mol", max_iter: Optional[int] = None + ) -> None: """Rebuild conjugated bonds without negatively charged atoms at the beginning and end of the conjugated system @@ -391,15 +412,13 @@ def _rebuild_conjugated_bonds(self, mol, max_iter=None): # there's usually an even number of matches for this pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") # pattern used to finish fixing a series of conjugated bonds - base_end_pattern = Chem.MolFromSmarts( - "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + base_end_pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") # used when there's an odd number of matches for `pattern` odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," - "$([S;D4;v4]-[O-])]") + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O),$([S;D4;v4]-[O-])]" + ) # number of unique matches with the pattern - n_matches = len(set([match[0] - for match in mol.GetSubstructMatches(pattern)])) + n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) # nothing to standardize if n_matches == 0: return @@ -427,29 +446,32 @@ def _rebuild_conjugated_bonds(self, mol, max_iter=None): # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] # transform the =O to -[O-] if ( - term_atom.GetAtomicNum() == 6 - and term_atom.GetFormalCharge() == 0 + term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0 ) or ( - term_atom.GetAtomicNum() == 7 - and term_atom.GetFormalCharge() == 1 + term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 1 ): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - bond.GetBondTypeAsDouble() == 2): + if ( + neighbor.GetAtomicNum() == 8 + and bond.GetBondTypeAsDouble() == 2 + ): bond.SetBondType(Chem.BondType.SINGLE) neighbor.SetFormalCharge(-1) break # edge-case 2: S=O # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O # transform -[O-] to =O - elif (term_atom.GetAtomicNum() == 16 and - term_atom.GetFormalCharge() == 0): + elif ( + term_atom.GetAtomicNum() == 16 and term_atom.GetFormalCharge() == 0 + ): for neighbor in term_atom.GetNeighbors(): bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) - if (neighbor.GetAtomicNum() == 8 and - neighbor.GetFormalCharge() == -1 and - bond.GetBondTypeAsDouble() == 1): + if ( + neighbor.GetAtomicNum() == 8 + and neighbor.GetFormalCharge() == -1 + and bond.GetBondTypeAsDouble() == 1 + ): bond.SetBondType(Chem.BondType.DOUBLE) neighbor.SetFormalCharge(0) break @@ -525,20 +547,24 @@ def _rebuild_conjugated_bonds(self, mol, max_iter=None): return # reached max_iter - warnings.warn("The standardization could not be completed within a " - "reasonable number of iterations") + warnings.warn( + "The standardization could not be completed within a " + "reasonable number of iterations" + ) @dataclass(frozen=True) class TemplateInferer: - """Infer bond orders and charges by matching the molecule with a template molecule containing bond orders and charges. + """Infer bond orders and charges by matching the molecule with a template molecule + containing bond orders and charges. Attributes ---------- - template : Chem.Mol + template : rdkit.Chem.rdchem.Mol Molecule containing the bond orders and charges. """ - template: Any - def __call__(self, mol): - return AllChem.AssignBondOrdersFromTemplate(self.template, mol) + template: "Chem.Mol" + + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + return AssignBondOrdersFromTemplate(self.template, mol) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 61bd194e158..2ee58f7d7a8 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -35,27 +35,31 @@ from numpy.testing import assert_allclose, assert_equal with suppress(ImportError): - from MDAnalysis.converters.RDKit import (RDATTRIBUTES, - atomgroup_to_mol, - _add_mda_attr_to_rdkit, - _set_atom_property) + from MDAnalysis.converters.RDKit import ( + RDATTRIBUTES, + _add_mda_attr_to_rdkit, + _set_atom_property, + atomgroup_to_mol, + ) from MDAnalysis.converters.RDKitInferring import MDAnalysisInferer, TemplateInferer - from rdkit import Chem from rdkit.Chem import AllChem -requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), - reason="requires RDKit") +requires_rdkit = pytest.mark.skipif( + import_not_available("rdkit"), reason="requires RDKit" +) -@pytest.mark.skipif(not import_not_available("rdkit"), - reason="only for min dependencies build") +@pytest.mark.skipif( + not import_not_available("rdkit"), reason="only for min dependencies build" +) class TestRequiresRDKit(object): def test_converter_requires_rdkit(self): u = mda.Universe(PDB_full) - with pytest.raises(ImportError, - match="RDKit is required for the RDKitConverter"): + with pytest.raises( + ImportError, match="RDKit is required for the RDKitConverter" + ): u.atoms.convert_to("RDKIT") @@ -101,22 +105,27 @@ def product(request): def is_isomorphic(mol, ref, useChirality=False): - return (mol.HasSubstructMatch(ref, useChirality=useChirality) - and ref.HasSubstructMatch(mol, useChirality=useChirality)) + return mol.HasSubstructMatch( + ref, useChirality=useChirality + ) and ref.HasSubstructMatch(mol, useChirality=useChirality) @requires_rdkit class TestRDKitReader(object): - @pytest.mark.parametrize("rdmol, n_frames", [ - ("mol2_mol", 1), - ("smiles_mol", 3), - ], indirect=["rdmol"]) + @pytest.mark.parametrize( + "rdmol, n_frames", + [ + ("mol2_mol", 1), + ("smiles_mol", 3), + ], + indirect=["rdmol"], + ) def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames - expected = np.array([ - conf.GetPositions() for conf in rdmol.GetConformers()], - dtype=np.float32) + expected = np.array( + [conf.GetPositions() for conf in rdmol.GetConformers()], dtype=np.float32 + ) assert_equal(expected, universe.trajectory.coordinate_array) def test_no_coordinates(self): @@ -130,8 +139,7 @@ def test_compare_mol2reader(self): universe = mda.Universe(MolFactory.mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal(universe.trajectory.ts.positions, - mol2.trajectory.ts.positions) + assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) @requires_rdkit @@ -144,16 +152,17 @@ def pdb(self): def mol2(self): u = mda.Universe(mol2_molecule) # add elements - elements = np.array([guess_atom_element(x) for x in u.atoms.types], - dtype=object) - u.add_TopologyAttr('elements', elements) + elements = np.array( + [guess_atom_element(x) for x in u.atoms.types], dtype=object + ) + u.add_TopologyAttr("elements", elements) return u @pytest.fixture def peptide(self): u = mda.Universe(GRO) elements = mda.topology.guessers.guess_types(u.atoms.names) - u.add_TopologyAttr('elements', elements) + u.add_TopologyAttr("elements", elements) return u.select_atoms("resid 2-12") @pytest.fixture @@ -167,28 +176,33 @@ def test_no_atoms_attr(self): @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) def test_single_atom_mol(self, smi): - u = mda.Universe.from_smiles(smi, addHs=False, - generate_coordinates=False) + u = mda.Universe.from_smiles(smi, addHs=False, generate_coordinates=False) mol = u.atoms.convert_to.rdkit(inferer=None) assert mol.GetNumAtoms() == 1 assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") - @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ - ("PRO", 14, 1), - ("ILE", 38, 1), - ("ALA", 20, 2), - ("GLY", 21, 3), - ]) + @pytest.mark.parametrize( + "resname, n_atoms, n_fragments", + [ + ("PRO", 14, 1), + ("ILE", 38, 1), + ("ALA", 20, 2), + ("GLY", 21, 3), + ], + ) def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): mol = peptide.select_atoms("resname %s" % resname).convert_to("RDKIT") assert n_atoms == mol.GetNumAtoms() assert n_fragments == len(Chem.GetMolFrags(mol)) - @pytest.mark.parametrize("sel_str, atom_index", [ - ("resid 1", 0), - ("resname LYS and name NZ", 1), - ("resid 34 and altloc B", 2), - ]) + @pytest.mark.parametrize( + "sel_str, atom_index", + [ + ("resid 1", 0), + ("resname LYS and name NZ", 1), + ("resid 34 and altloc B", 2), + ], + ) def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) mda_atom = sel.atoms[atom_index] @@ -205,8 +219,7 @@ def test_monomer_info(self, pdb, sel_str, atom_index): assert mda_atom.segindex == mi.GetSegmentNumber() assert mda_atom.tempfactor == mi.GetTempFactor() - @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], - indirect=True) + @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], indirect=True) def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") @@ -215,28 +228,23 @@ def test_identical_topology(self, rdmol): assert_equal(u.atoms.bonds, u2.atoms.bonds) assert_equal(u.atoms.elements, u2.atoms.elements) assert_equal(u.atoms.names, u2.atoms.names) - assert_allclose( - u.atoms.positions, - u2.atoms.positions, - rtol=0, - atol=1e-7) + assert_allclose(u.atoms.positions, u2.atoms.positions, rtol=0, atol=1e-7) def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) # Delete topology attribute (PR #3069) - u.del_TopologyAttr('elements') + u.del_TopologyAttr("elements") with pytest.raises( AttributeError, - match="`elements` attribute is required for the RDKitConverter" + match="`elements` attribute is required for the RDKitConverter", ): u.atoms.convert_to("RDKIT") def test_warn_guess_bonds(self): u = mda.Universe(PDB_helix) - with pytest.warns(UserWarning, - match="No `bonds` attribute in this AtomGroup"): + with pytest.warns(UserWarning, match="No `bonds` attribute in this AtomGroup"): u.atoms.convert_to("RDKIT") def test_bonds_outside_sel(self): @@ -245,9 +253,10 @@ def test_bonds_outside_sel(self): ag.convert_to.rdkit(inferer=None) def test_error_no_hydrogen(self, uo2): - with pytest.raises(AttributeError, - match="the converter requires all hydrogens to be " - "explicit"): + with pytest.raises( + AttributeError, + match="the converter requires all hydrogens to be " "explicit", + ): uo2.atoms.convert_to("RDKIT") def test_error_no_hydrogen_skip_inferring(self, uo2): @@ -256,28 +265,30 @@ def test_error_no_hydrogen_skip_inferring(self, uo2): uo2.atoms.convert_to.rdkit(inferer=None) def test_warning_no_hydrogen_force(self, uo2): - with pytest.warns(UserWarning, - match="Forcing to continue the conversion"): + with pytest.warns(UserWarning, match="Forcing to continue the conversion"): uo2.atoms.convert_to.rdkit(force=True) - @pytest.mark.parametrize("attr, value, expected", [ - ("names", "N", " N "), - ("names", "CA", " CA "), - ("names", "CAT", " CAT"), - ("names", "N1", " N1 "), - ("names", "CE2", " CE2"), - ("names", "C12", " C12"), - ("names", "HD12", "HD12"), - ("names", "C123", "C123"), - ("altLocs", "A", "A"), - ("chainIDs", "B", "B"), - ("icodes", "C", "C"), - ("occupancies", 0.5, 0.5), - ("resnames", "LIG", "LIG"), - ("resids", 123, 123), - ("segindices", 1, 1), - ("tempfactors", 0.8, 0.8), - ]) + @pytest.mark.parametrize( + "attr, value, expected", + [ + ("names", "N", " N "), + ("names", "CA", " CA "), + ("names", "CAT", " CAT"), + ("names", "N1", " N1 "), + ("names", "CE2", " CE2"), + ("names", "C12", " C12"), + ("names", "HD12", "HD12"), + ("names", "C123", "C123"), + ("altLocs", "A", "A"), + ("chainIDs", "B", "B"), + ("icodes", "C", "C"), + ("occupancies", 0.5, 0.5), + ("resnames", "LIG", "LIG"), + ("resids", 123, 123), + ("segindices", 1, 1), + ("tempfactors", 0.8, 0.8), + ], + ) def test_add_mda_attr_to_rdkit(self, attr, value, expected): mi = Chem.AtomPDBResidueInfo() _add_mda_attr_to_rdkit(attr, value, mi) @@ -293,10 +304,13 @@ def test_other_attributes(self, mol2, idx): assert mda_atom.segid == rdatom.GetProp("_MDAnalysis_segid") assert mda_atom.type == rdatom.GetProp("_MDAnalysis_type") - @pytest.mark.parametrize("sel_str", [ - "resname ALA", - "resname PRO and segid A", - ]) + @pytest.mark.parametrize( + "sel_str", + [ + "resname ALA", + "resname PRO and segid A", + ], + ) def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) mol = ag.convert_to.rdkit(inferer=None) @@ -316,7 +330,8 @@ def test_assign_stereochemistry(self, mol2): def test_trajectory_coords(self): u = mda.Universe.from_smiles( - "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42)) + "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42) + ) for ts in u.trajectory: mol = u.atoms.convert_to("RDKIT") positions = mol.GetConformer().GetPositions() @@ -394,7 +409,8 @@ def test_sanitize_fail_warning(self): with pytest.warns() as record: u.atoms.convert_to.rdkit(inferer=None) assert "Could not sanitize molecule" not in "\n".join( - [str(r.message) for r in record]) + [str(r.message) for r in record] + ) def test_deprecation_maw_iter(self, mol2): with pytest.warns(DeprecationWarning, match="Using `max_iter` is deprecated"): @@ -426,7 +442,6 @@ def potatoe(mol): @requires_rdkit class TestRDKitMDAnalysisInferer(object): - def add_Hs_remove_bo_and_charges(self, mol): """Add hydrogens and remove bond orders and charges from a molecule""" mH = Chem.AddHs(mol) @@ -465,26 +480,30 @@ def assert_isomorphic_resonance_structure(self, mol, ref): """ isomorphic = mol.HasSubstructMatch(ref) if not isomorphic: - isomorphic = bool(Chem.ResonanceMolSupplier(mol) - .GetSubstructMatch(ref)) + isomorphic = bool(Chem.ResonanceMolSupplier(mol).GetSubstructMatch(ref)) assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" - @pytest.mark.parametrize("smi, out", [ - ("C(-[H])(-[H])(-[H])-[H]", "C"), - ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), - ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", - "c1ccccc1"), - ("C-[C](-[H])-[O]", "C(=O)C"), - ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), - ("[N]-[C]-[H]", "N#C"), - ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), - ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), - ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), - ("[H]-[O]-[N]-[O]", "ON=O"), - ("[N]-[C]-[O]", "N#C[O-]"), - ]) + @pytest.mark.parametrize( + "smi, out", + [ + ("C(-[H])(-[H])(-[H])-[H]", "C"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ( + "[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", + "c1ccccc1", + ), + ("C-[C](-[H])-[O]", "C(=O)C"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[N]-[C]-[H]", "N#C"), + ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), + ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), + ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), + ("[H]-[O]-[N]-[O]", "ON=O"), + ("[N]-[C]-[O]", "N#C[O-]"), + ], + ) def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) @@ -495,12 +514,15 @@ def test_infer_bond_orders(self, smi, out): molref = Chem.MolFromSmiles(out) assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) - @pytest.mark.parametrize("smi, atom_idx, charge", [ - ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), - ("[N]-[C]-[O]", 2, -1), - ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), - ("C-[C](-[O])-[O]", 3, -1), - ]) + @pytest.mark.parametrize( + "smi, atom_idx, charge", + [ + ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), + ("[N]-[C]-[O]", 2, -1), + ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), + ("C-[C](-[O])-[O]", 3, -1), + ], + ) def test_infer_charges(self, smi, atom_idx, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) @@ -509,25 +531,28 @@ def test_infer_charges(self, smi, atom_idx, charge): Chem.SanitizeMol(mol) assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge - @pytest.mark.parametrize("smi, out", [ - ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), - ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), - ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), - ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", - "CNC(N)=[N+](-[H])-[H]"), - ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), - ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), - ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), - ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), - ("[C-](=O)-C", "[C](=O)-C"), - ("[H]-[N-]-C", "[H]-[N]-C"), - ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", - "[O-]c1ccccc1"), - ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", - "[O-]C1=CC=CC1=O"), - ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), - ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), - ]) + @pytest.mark.parametrize( + "smi, out", + [ + ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), + ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), + ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), + ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", "CNC(N)=[N+](-[H])-[H]"), + ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), + ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), + ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), + ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), + ("[C-](=O)-C", "[C](=O)-C"), + ("[H]-[N-]-C", "[H]-[N]-C"), + ( + "[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", + "[O-]c1ccccc1", + ), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", "[O-]C1=CC=CC1=O"), + ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), + ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), + ], + ) def test_standardize_patterns(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) @@ -536,21 +561,24 @@ def test_standardize_patterns(self, smi, out): molref = Chem.MolFromSmiles(out) assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) - @pytest.mark.parametrize("attr, value, getter", [ - ("index", 42, "GetIntProp"), - ("index", np.int8(42), "GetIntProp"), - ("index", np.int16(42), "GetIntProp"), - ("index", np.int32(42), "GetIntProp"), - ("index", np.int64(42), "GetIntProp"), - ("index", np.uint8(42), "GetIntProp"), - ("index", np.uint16(42), "GetIntProp"), - ("index", np.uint32(42), "GetIntProp"), - ("index", np.uint64(42), "GetIntProp"), - ("charge", 4.2, "GetDoubleProp"), - ("charge", np.float32(4.2), "GetDoubleProp"), - ("charge", np.float64(4.2), "GetDoubleProp"), - ("type", "C.3", "GetProp"), - ]) + @pytest.mark.parametrize( + "attr, value, getter", + [ + ("index", 42, "GetIntProp"), + ("index", np.int8(42), "GetIntProp"), + ("index", np.int16(42), "GetIntProp"), + ("index", np.int32(42), "GetIntProp"), + ("index", np.int64(42), "GetIntProp"), + ("index", np.uint8(42), "GetIntProp"), + ("index", np.uint16(42), "GetIntProp"), + ("index", np.uint32(42), "GetIntProp"), + ("index", np.uint64(42), "GetIntProp"), + ("charge", 4.2, "GetDoubleProp"), + ("charge", np.float32(4.2), "GetDoubleProp"), + ("charge", np.float64(4.2), "GetDoubleProp"), + ("type", "C.3", "GetProp"), + ], + ) def test_set_atom_property(self, attr, value, getter): atom = Chem.Atom(1) prop = "_MDAnalysis_%s" % attr @@ -562,11 +590,14 @@ def test_ignore_prop(self): _set_atom_property(atom, "foo", {"bar": "baz"}) assert "foo" not in atom.GetPropsAsDict().items() - @pytest.mark.parametrize("smi", [ - "c1ccc(cc1)-c1ccccc1-c1ccccc1", - "c1cc[nH]c1", - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - ]) + @pytest.mark.parametrize( + "smi", + [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + ], + ) def test_transfer_properties(self, smi): mol = Chem.MolFromSmiles(smi) mol = self.add_Hs_remove_bo_and_charges(mol) @@ -580,86 +611,88 @@ def test_transfer_properties(self, smi): new = {} for a in newmol.GetAtoms(): ix = a.GetIntProp("_MDAnalysis_index") - new[ix] = {"_MDAnalysis_index": ix, - "dummy": a.GetProp("dummy")} + new[ix] = {"_MDAnalysis_index": ix, "dummy": a.GetProp("dummy")} props = a.GetPropsAsDict().keys() assert "old_mapno" not in props assert "react_atom_idx" not in props assert new == old - @pytest.mark.parametrize("smi", [ - "c1ccc(cc1)-c1ccccc1-c1ccccc1", - "c1cc[nH]c1", - "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", - "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", - "c1csc(c1)-c1ccoc1-c1cc[nH]c1", - "C1=C2C(=NC=N1)N=CN2", - "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", - "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", - "C=CC=CC=CC=CC=CC=C", - "NCCCCC([NH3+])C(=O)[O-]", - "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", - "C#CC=C", - # HID HIE HIP residues, see PR #2941 - "O=C([C@H](CC1=CNC=N1)N)O", - "O=C([C@H](CC1=CN=CN1)N)O", - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - # fixes from PR #3044 - "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", - "[O-][n+]1cccnc1", - "C[n+]1ccccc1", - "[PH4+]", - "c1nc[nH]n1", - "CC(=O)C=C(C)N", - "CC(C)=CC=C[O-]", - "O=S(C)(C)=NC", - "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", - "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", - "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", - "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", - "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", - "[O-][n+]1onc2ccccc21", - "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", - "[O-]c1ccccc1", - "[O-]C=CC=CCC=CC=[N+](C)C", - "C=[N+](-[O-])-C", - "C-[N-]-C(=O)C", - # amino acids - "C[C@H](N)C(=O)O", # A - "NCC(=O)O", # G - "CC[C@H](C)[C@H](N)C(=O)O", # I - "CC(C)C[C@H](N)C(=O)O", # L - "O=C(O)[C@@H]1CCCN1", # P - "CC(C)[C@H](N)C(=O)O", # V - "N[C@@H](Cc1ccccc1)C(=O)O", # F - "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W - "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y - "N[C@@H](CC(=O)O)C(=O)O", # D - "N[C@@H](CCC(=O)O)C(=O)O", # E - "N=C(N)NCCC[C@H](N)C(=O)O", # R - "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H - "NCCCC[C@H](N)C(=O)O", # K - "N[C@@H](CO)C(=O)O", # S - "C[C@@H](O)[C@H](N)C(=O)O", # T - "N[C@@H](CS)C(=O)O", # C - "CSCC[C@H](N)C(=O)O", # M - "NC(=O)C[C@H](N)C(=O)O", # N - "NC(=O)CC[C@H](N)C(=O)O", # Q - # nucleic acids - "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A - "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T - "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U - "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C - "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G - # RDKitConverter benchmark 2022-05-23, fixed by better sorting - "CC(C)NS(C)(=O)=Nc1ccc(-c2ccc(-c3nc4[nH]c(O[C@@H]5CO[C@@H]6[C@H](O)CO[C@H]56)nc4cc3Cl)cc2)cc1 CHEMBL4111464", - "C[n+]1c2ccccc2c(C(=O)O)c2[nH]c3ccc(Br)cc3c21 CHEMBL511591", - "C[n+]1ccccc1-c1ccc(NC(=O)c2ccc(C(=O)Nc3ccc(-c4cccc[n+]4C)cc3)cc2)cc1 CHEMBL3230729", - "Cc1ccc(/C(O)=C(/C([S-])=NCc2ccccc2)[n+]2cccc(CO)c2)cc1[N+](=O)[O-] CHEMBL1596426", - "CC(C)(O)c1cc2c(cc1C(C)(C)O)-c1ccccc1-2 CHEMBL1990966", - "[O-]/N=C1/c2ccccc2-c2nc3ccc(C(F)(F)F)cc3nc21 CHEMBL4557878", - "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", - ]) + @pytest.mark.parametrize( + "smi", + [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", + # HID HIE HIP residues, see PR #2941 + "O=C([C@H](CC1=CNC=N1)N)O", + "O=C([C@H](CC1=CN=CN1)N)O", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + # fixes from PR #3044 + "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", + "[O-][n+]1cccnc1", + "C[n+]1ccccc1", + "[PH4+]", + "c1nc[nH]n1", + "CC(=O)C=C(C)N", + "CC(C)=CC=C[O-]", + "O=S(C)(C)=NC", + "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", + "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", + "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", + "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", + "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", + "[O-][n+]1onc2ccccc21", + "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", + "[O-]c1ccccc1", + "[O-]C=CC=CCC=CC=[N+](C)C", + "C=[N+](-[O-])-C", + "C-[N-]-C(=O)C", + # amino acids + "C[C@H](N)C(=O)O", # A + "NCC(=O)O", # G + "CC[C@H](C)[C@H](N)C(=O)O", # I + "CC(C)C[C@H](N)C(=O)O", # L + "O=C(O)[C@@H]1CCCN1", # P + "CC(C)[C@H](N)C(=O)O", # V + "N[C@@H](Cc1ccccc1)C(=O)O", # F + "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W + "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y + "N[C@@H](CC(=O)O)C(=O)O", # D + "N[C@@H](CCC(=O)O)C(=O)O", # E + "N=C(N)NCCC[C@H](N)C(=O)O", # R + "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H + "NCCCC[C@H](N)C(=O)O", # K + "N[C@@H](CO)C(=O)O", # S + "C[C@@H](O)[C@H](N)C(=O)O", # T + "N[C@@H](CS)C(=O)O", # C + "CSCC[C@H](N)C(=O)O", # M + "NC(=O)C[C@H](N)C(=O)O", # N + "NC(=O)CC[C@H](N)C(=O)O", # Q + # nucleic acids + "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A + "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T + "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U + "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C + "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G + # RDKitConverter benchmark 2022-05-23, fixed by better sorting + "CC(C)NS(C)(=O)=Nc1ccc(-c2ccc(-c3nc4[nH]c(O[C@@H]5CO[C@@H]6[C@H](O)CO[C@H]56)nc4cc3Cl)cc2)cc1 CHEMBL4111464", + "C[n+]1c2ccccc2c(C(=O)O)c2[nH]c3ccc(Br)cc3c21 CHEMBL511591", + "C[n+]1ccccc1-c1ccc(NC(=O)c2ccc(C(=O)Nc3ccc(-c4cccc[n+]4C)cc3)cc2)cc1 CHEMBL3230729", + "Cc1ccc(/C(O)=C(/C([S-])=NCc2ccccc2)[n+]2cccc(CO)c2)cc1[N+](=O)[O-] CHEMBL1596426", + "CC(C)(O)c1cc2c(cc1C(C)(C)O)-c1ccccc1-2 CHEMBL1990966", + "[O-]/N=C1/c2ccccc2-c2nc3ccc(C(F)(F)F)cc3nc21 CHEMBL4557878", + "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", + ], + ) def test_order_independant(self, smi): # generate mol with hydrogens but without bond orders ref = Chem.MolFromSmiles(smi) @@ -671,12 +704,15 @@ def test_order_independant(self, smi): self.assert_isomorphic_resonance_structure(m, ref) @pytest.mark.xfail(reason="Not currently tackled by the RDKitConverter") - @pytest.mark.parametrize("smi", [ - "C-[N+]#N", - "C-N=[N+]=[N-]", - "C-[O+]=C", - "C-[N+]#[C-]", - ]) + @pytest.mark.parametrize( + "smi", + [ + "C-[N+]#N", + "C-N=[N+]=[N-]", + "C-[O+]=C", + "C-[N+]#[C-]", + ], + ) def test_order_independant_issue_3339(self, smi): self.test_order_independant(smi) @@ -684,8 +720,7 @@ def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) inferer = MDAnalysisInferer(max_iter=2) - with pytest.warns(UserWarning, - match="reasonable number of iterations"): + with pytest.warns(UserWarning, match="reasonable number of iterations"): inferer._rebuild_conjugated_bonds(mol) def test_deprecation_warning_max_iter(self): @@ -694,28 +729,45 @@ def test_deprecation_warning_max_iter(self): inferer = MDAnalysisInferer() with pytest.warns( DeprecationWarning, - match="Specifying `max_iter` is deprecated and will be removed" + match="Specifying `max_iter` is deprecated and will be removed", ): inferer._standardize_patterns(mol, max_iter=1) - @pytest.mark.parametrize("smi", [ - "[Li+]", "[Na+]", "[K+]", "[Rb+]", "[Ag+]", "[Cs+]", - "[Mg+2]", "[Ca+2]", "[Cu+2]", "[Zn+2]", "[Sr+2]", "[Ba+2]", - "[Al+3]", "[Fe+2]", - "[Cl-]", - "[O-2]", - "[Na+].[Cl-]", - ]) + @pytest.mark.parametrize( + "smi", + [ + "[Li+]", + "[Na+]", + "[K+]", + "[Rb+]", + "[Ag+]", + "[Cs+]", + "[Mg+2]", + "[Ca+2]", + "[Cu+2]", + "[Zn+2]", + "[Sr+2]", + "[Ba+2]", + "[Al+3]", + "[Fe+2]", + "[Cl-]", + "[O-2]", + "[Na+].[Cl-]", + ], + ) def test_ions(self, smi): ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) mol = self.assign_bond_orders_and_charges(stripped_mol) assert is_isomorphic(mol, ref) - @pytest.mark.parametrize("smi", [ - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - "O=S(C)(C)=NC", - ]) + @pytest.mark.parametrize( + "smi", + [ + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + "O=S(C)(C)=NC", + ], + ) def test_reorder_atoms(self, smi): mol = Chem.MolFromSmiles(smi) mol = Chem.AddHs(mol) @@ -733,15 +785,17 @@ def test_pdb_names(self): u = mda.Universe(PDB_helix) mol = u.atoms.convert_to.rdkit() names = u.atoms.names - rd_names = np.array([a.GetProp("_MDAnalysis_name") - for a in mol.GetAtoms()]) + rd_names = np.array([a.GetProp("_MDAnalysis_name") for a in mol.GetAtoms()]) assert (names == rd_names).all() - @pytest.mark.parametrize("smi", [ - r"F/C(Br)=C(Cl)/I", - r"F\C(Br)=C(Cl)\I", - "F-C(Br)=C(Cl)-I", - ]) + @pytest.mark.parametrize( + "smi", + [ + r"F/C(Br)=C(Cl)/I", + r"F\C(Br)=C(Cl)\I", + "F-C(Br)=C(Cl)-I", + ], + ) def test_bond_stereo_not_stereoany(self, smi): u = mda.Universe.from_smiles(smi) mol = u.atoms.convert_to.rdkit(force=True) @@ -750,18 +804,19 @@ def test_bond_stereo_not_stereoany(self, smi): assert bond.GetStereo() != Chem.BondStereo.STEREOANY def test_atom_sorter(self): - mol = Chem.MolFromSmiles( - "[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False) + mol = Chem.MolFromSmiles("[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False) # corresponding mol: C=C-C#C # atom indices: 1 3 5 6 mol.UpdatePropertyCache() - sorted_atoms = sorted([atom for atom in mol.GetAtoms() - if atom.GetAtomicNum() > 1], - key=MDAnalysisInferer._atom_sorter) + sorted_atoms = sorted( + [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1], + key=MDAnalysisInferer._atom_sorter, + ) sorted_indices = [atom.GetIdx() for atom in sorted_atoms] assert sorted_indices == [6, 5, 1, 3] +@requires_rdkit class TestRDKitTemplateInferer: @pytest.fixture(scope="function") def template(self): From fd053d992343302d4e19a5619447efc94b15c366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 15 Dec 2023 20:24:22 +0100 Subject: [PATCH 08/14] remaining formating issues and docs --- package/MDAnalysis/converters/RDKit.py | 72 +++++--- .../MDAnalysis/converters/RDKitInferring.py | 170 ++++++++++-------- .../MDAnalysisTests/converters/test_rdkit.py | 92 +++++++--- 3 files changed, 213 insertions(+), 121 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 1962a3cc897..109016f9883 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -155,7 +155,8 @@ def __init__(self, filename, **kwargs): """ n_atoms = filename.GetNumAtoms() coordinates = np.array( - [conf.GetPositions() for conf in filename.GetConformers()], dtype=np.float32 + [conf.GetPositions() for conf in filename.GetConformers()], + dtype=np.float32, ) if coordinates.size == 0: warnings.warn("No coordinates found in the RDKit molecule") @@ -249,9 +250,18 @@ class RDKitConverter(base.ConverterBase): guessed if not present. Hydrogens should be explicit in the topology file. If this is not the case, - use the parameter ``implicit_hydrogens=True`` when using the converter to allow - implicit hydrogens, and ``inferer=None`` to disable inferring bond orders and - charges. + use the parameter ``implicit_hydrogens=True`` when using the converter to + allow implicit hydrogens, and ``inferer=None`` to disable inferring bond + orders and charges. You can also pass your own callable function to + ``inferer`` to assign bond orders and charges as you see fit:: + + >>> from rdkit import Chem + >>> from rdkit.Chem.AllChem import AssignBondOrdersFromTemplate + >>> template = Chem.MolFromSmiles("NC(Cc1ccccc1)C(=O)O") + >>> def infer_from_template(mol: Chem.Mol) -> Chem.Mol: + ... return AssignBondOrdersFromTemplate(template, mol) + >>> mol = u.atoms.convert_to.rdkit(inferer=infer_from_template) + Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every @@ -260,8 +270,8 @@ class RDKitConverter(base.ConverterBase): sensitive to the arguments that were passed to the converter. The number of objects cached can be changed with the function :func:`set_converter_cache_size`. However, ``ag.convert_to("RDKIT")`` - followed by ``ag.convert_to("RDKIT", implicit_hydrogens=False)`` will not use the - cache since the arguments given are different. You can pass a + followed by ``ag.convert_to("RDKIT", implicit_hydrogens=False)`` will not + use the cache since the arguments given are different. You can pass a ``cache=False`` argument to the converter to bypass the caching system. The ``_MDAnalysis_index`` property of the resulting molecule corresponds @@ -288,6 +298,13 @@ class RDKitConverter(base.ConverterBase): files while the original name is still available through ``atom.GetProp("_MDAnalysis_name")`` + .. versionchanged:: 2.7.0 + Deprecated ``max_iter`` (moved to the inferer class + :class:`~MDAnalysis.converters.RDKitInferring.MDAnalysisInferer`) and + deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added + ``inferer`` to specify a callable that can transform the molecule (this + operation is cached). + """ lib = "RDKIT" @@ -315,20 +332,13 @@ def convert( same AtomGroup selection and with the same arguments passed to the converter inferer : Optional[Callable[[Chem.Mol], Chem.Mol]] - A callable to infer bond orders and charges for the RDKit molecule created - by the converter. If ``None``, inferring is skipped. + A callable to infer bond orders and charges for the RDKit molecule + created by the converter. If ``None``, inferring is skipped. implicit_hydrogens : bool Whether to allow implicit hydrogens on the molecule or not. force : bool Force the conversion when no hydrogens were detected but ``inferer`` is not ``None``. Useful for inorganic molecules mostly. - - .. versionchanged:: 2.7.0 - Deprecated ``max_iter`` (moved to the inferer class - :class:`~MDAnalysis.converters.RDKitInferring.MDAnalysisInferer`) and - deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added - ``inferer`` to specify a callable that can transform the molecule (this - operation is cached). """ try: @@ -350,8 +360,8 @@ def convert( if (max_iter := kwargs.get("max_iter")) is not None: warnings.warn( - "Using `max_iter` is deprecated, use `MDAnalysisInferer(max_iter=...)` " - "instead", + "Using `max_iter` is deprecated, use `MDAnalysisInferer " + "(max_iter=...)` instead", DeprecationWarning, ) if isinstance(inferer, MDAnalysisInferer): @@ -359,9 +369,9 @@ def convert( if (NoImplicit := kwargs.get("NoImplicit")) is not None: warnings.warn( - "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " - "To disable bond order and formal charge inferring, use " - "`inferer=None`", + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` " + " instead. To disable bond order and formal charge inferring, " + "use `inferer=None`", DeprecationWarning, ) implicit_hydrogens = not NoImplicit @@ -403,7 +413,11 @@ def convert( @lru_cache(maxsize=2) def atomgroup_to_mol( - ag, implicit_hydrogens=False, force=False, inferer=DEFAULT_INFERER, **kwargs + ag, + implicit_hydrogens=False, + force=False, + inferer=DEFAULT_INFERER, + **kwargs, ): """Converts an AtomGroup to an RDKit molecule without coordinates. @@ -414,11 +428,11 @@ def atomgroup_to_mol( implicit_hydrogens : bool Whether to allow implicit hydrogens on the molecule or not. force : bool - Force the conversion when no hydrogens were detected but ``inferer`` is not - ``None``. Useful for inorganic molecules mostly. + Force the conversion when no hydrogens were detected but ``inferer`` + is not ``None``. Useful for inorganic molecules mostly. inferer : Optional[Callable[[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol]] - A callable to infer bond orders and charges for the RDKit molecule created - by the converter. If ``None``, inferring is skipped. + A callable to infer bond orders and charges for the RDKit molecule + created by the converter. If ``None``, inferring is skipped. """ try: elements = ag.elements @@ -441,14 +455,14 @@ def atomgroup_to_mol( "No hydrogen atom could be found in the topology, but the " "converter requires all hydrogens to be explicit. You can use " "the parameter ``inferer=None`` when using the converter " - "to disable inferring bond orders and charges. You can also use " - "``force=True`` to ignore this error." + "to disable inferring bond orders and charges. You can also " + "use ``force=True`` to ignore this error." ) if (NoImplicit := kwargs.pop("NoImplicit", None)) is not None: warnings.warn( - "Using `NoImplicit` is deprecated, use `implicit_hydrogens` instead. " - "To disable bond order and formal charge inferring, use " + "Using `NoImplicit` is deprecated, use `implicit_hydrogens` " + "instead. To disable bond order and formal charge inferring, use " "`inferer=None`", DeprecationWarning, ) diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 81204e4d9d6..aa59dfe460b 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -30,6 +30,10 @@ .. autoclass:: MDAnalysisInferer :members: + :private-members: + +.. autoclass:: TemplateInferer + :members: """ import warnings from contextlib import suppress @@ -57,8 +61,8 @@ @dataclass(frozen=True) class MDAnalysisInferer: - """Bond order and formal charge inferring as originally implemented for the RDKit - converter. + """Bond order and formal charge inferring as originally implemented for + the RDKit converter. Attributes ---------- @@ -66,13 +70,13 @@ class MDAnalysisInferer: Maximum number of iterations to standardize conjugated systems. See :meth:`~MDAnalysisInferer._rebuild_conjugated_bonds` MONATOMIC_CATION_CHARGES : ClassVar[Dict[int, int]] - Charges that should be assigned to monatomic cations. Maps atomic number to - their formal charge. Anion charges are directly handled by the code using the - typical valence of the atom. + Charges that should be assigned to monatomic cations. Maps atomic + number to their formal charge. Anion charges are directly handled by + the code using the typical valence of the atom. STANDARDIZATION_REACTIONS : ClassVar[List[str]] - Reactions uses by :meth:`~MDAnalysisInferer._standardize_patterns` to fix - challenging cases must have single reactant and product, and cannot add any - atom. + Reactions uses by :meth:`~MDAnalysisInferer._standardize_patterns` to + fix challenging cases must have single reactant and product, and + cannot add any atom. .. versionadded:: 2.7.0 """ @@ -116,8 +120,7 @@ def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": Returns ------- - A new molecule with proper bond orders and charges. The molecule needs to be - sanitized. + A new molecule with proper bond orders and charges. """ self._infer_bo_and_charges(mol) mol = self._standardize_patterns(mol) @@ -125,7 +128,9 @@ def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": # sanitize if possible err = Chem.SanitizeMol(mol, catchErrors=True) if err: - warnings.warn(f"Could not sanitize molecule: failed during step {err!r}") + warnings.warn( + f"Could not sanitize molecule: failed during step {err!r}" + ) return mol def _reorder_atoms(self, mol: "Chem.Mol") -> "Chem.Mol": @@ -134,7 +139,10 @@ def _reorder_atoms(self, mol: "Chem.Mol") -> "Chem.Mol": """ with suppress(KeyError): order = np.argsort( - [atom.GetIntProp("_MDAnalysis_index") for atom in mol.GetAtoms()] + [ + atom.GetIntProp("_MDAnalysis_index") + for atom in mol.GetAtoms() + ] ) return Chem.RenumberAtoms(mol, order.astype(int).tolist()) # not a molecule converted by MDAnalysis @@ -159,15 +167,15 @@ def _atom_sorter(cls, atom: "Chem.Atom") -> Tuple[int, int]: def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: """Infer bond orders and formal charges from a molecule. - Since most MD topology files don't explicitly retain information on bond - orders or charges, it has to be guessed from the topology. This is done by - looping over each atom and comparing its expected valence to the current - valence to get the Number of Unpaired Electrons (NUE). - If an atom has a negative NUE, it needs a positive formal charge (-NUE). - If two neighbouring atoms have UEs, the bond between them most + Since most MD topology files don't explicitly retain information on + bond orders or charges, it has to be guessed from the topology. This + is done by looping over each atom and comparing its expected valence + to the current valence to get the Number of Unpaired Electrons (NUE). + If an atom has a negative NUE, it needs a positive formal charge + (-NUE). If two neighbouring atoms have UEs, the bond between them most likely has to be increased by the value of the smallest NUE. - If after this process, an atom still has UEs, it needs a negative formal - charge of -NUE. + If after this process, an atom still has UEs, it needs a negative + formal charge of -NUE. Parameters ---------- @@ -177,11 +185,12 @@ def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: Notes ----- This algorithm is order dependant. For example, for a carboxylate group - R-C(-O)-O the first oxygen read will receive a double bond and the other - one will be charged. It will also affect more complex conjugated systems. + ``R-C(-O)-O`` the first oxygen read will receive a double bond and the + other one will be charged. It will also affect more complex conjugated + systems. """ - # heavy atoms sorted by number of heavy atom neighbors (lower first) then - # NUE (higher first) + # heavy atoms sorted by number of heavy atom neighbors (lower first) + # then NUE (higher first) atoms = sorted( [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1], key=cls._atom_sorter, @@ -192,7 +201,8 @@ def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: if atom.GetDegree() == 0: atom.SetFormalCharge( cls.MONATOMIC_CATION_CHARGES.get( - atom.GetAtomicNum(), -cls._get_nb_unpaired_electrons(atom)[0] + atom.GetAtomicNum(), + -cls._get_nb_unpaired_electrons(atom)[0], ) ) mol.UpdatePropertyCache(strict=False) @@ -226,7 +236,9 @@ def _infer_bo_and_charges(cls, mol: "Chem.Mol") -> None: # a common NUE of 0 means we don't need to do anything if common_nue != 0: # increase bond order - bond = mol.GetBondBetweenAtoms(atom.GetIdx(), na.GetIdx()) + bond = mol.GetBondBetweenAtoms( + atom.GetIdx(), na.GetIdx() + ) order = common_nue + 1 bond.SetBondType(RDBONDORDER[order]) mol.UpdatePropertyCache(strict=False) @@ -266,19 +278,20 @@ def _standardize_patterns( ) -> "Chem.Mol": """Standardizes functional groups - Uses :func:`_rebuild_conjugated_bonds` to standardize conjugated systems, - and SMARTS reactions for other functional groups. + Uses :meth:`~MDAnalysisInferer._rebuild_conjugated_bonds` to + standardize conjugated systems, and SMARTS reactions for other + functional groups. Due to the way reactions work, we first have to split the molecule by - fragments. Then, for each fragment, we apply the standardization reactions. - Finally, the fragments are recombined. + fragments. Then, for each fragment, we apply the standardization + reactions. Finally, the fragments are recombined. Parameters ---------- mol : rdkit.Chem.rdchem.RWMol The molecule to standardize max_iter : Optional[int] - Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. Maximum number - of iterations to standardize conjugated systems + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. + Maximum number of iterations to standardize conjugated systems Returns ------- @@ -320,9 +333,10 @@ def _standardize_patterns( max_iter = self.max_iter else: warnings.warn( - "Specifying `max_iter` is deprecated and will be removed in a future " - "update. Directly create an instance of `MDAnalysisInferer` with " - "`MDAnalysisInferer(max_iter=...)` instead.", + "Specifying `max_iter` is deprecated and will be removed in a " + "future update. Directly create an instance of " + "`MDAnalysisInferer` with `MDAnalysisInferer(max_iter=...)` " + "instead.", DeprecationWarning, ) @@ -330,7 +344,9 @@ def _standardize_patterns( self._rebuild_conjugated_bonds(mol, max_iter) # list of sanitized reactions - reactions = [ReactionFromSmarts(rxn) for rxn in self.STANDARDIZATION_REACTIONS] + reactions = [ + ReactionFromSmarts(rxn) for rxn in self.STANDARDIZATION_REACTIONS + ] # fragment mol (reactions must have single reactant and product) fragments = list(Chem.GetMolFrags(mol, asMols=True)) @@ -348,8 +364,9 @@ def _standardize_patterns( def _apply_reactions( reactions: List["ChemicalReaction"], reactant: "Chem.Mol" ) -> None: - """Applies a series of unimolecular reactions to a molecule. The reactions - must not add any atom to the product. The molecule is modified inplace. + """Applies a series of unimolecular reactions to a molecule. The + reactions must not add any atom to the product. The molecule is + modified inplace. Parameters ---------- @@ -374,21 +391,22 @@ def _rebuild_conjugated_bonds( """Rebuild conjugated bonds without negatively charged atoms at the beginning and end of the conjugated system - Depending on the order in which atoms are read during the conversion, the - :func:`_infer_bo_and_charges` function might write conjugated systems with - a double bond less and both edges of the system as anions instead of the - usual alternating single and double bonds. This function corrects this - behaviour by using an iterative procedure. + Depending on the order in which atoms are read during the conversion, + the :meth:`~MDAnalysisInferer._infer_bo_and_charges` function might + write conjugated systems with a double bond less and both edges of the + system as anions instead of the usual alternating single and double + bonds. This function corrects this behaviour by using an iterative + procedure. The problematic molecules always follow the same pattern: - ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the number - of successive single and double bonds. The goal of the iterative procedure - is to make ``n`` as small as possible by consecutively transforming - ``anion-*=*`` into ``*=*-anion`` until it reaches the smallest pattern with - ``n=1``. This last pattern is then transformed from ``anion-*=*-anion`` to - ``*=*-*=*``. - Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, we can - control that we don't transform the same triplet of atoms back and forth by - adding their indices to a list. + ``anion[-*=*]n-anion`` instead of ``*=[*-*=]n*``, where ``n`` is the + number of successive single and double bonds. The goal of the + iterative procedure is to make ``n`` as small as possible by + consecutively transforming ``anion-*=*`` into ``*=*-anion`` until it + reaches the smallest pattern with ``n=1``. This last pattern is then + transformed ``anion-*=*-anion`` to ``*=*-*=*``. + Since ``anion-*=*`` is the same as ``*=*-anion`` in terms of SMARTS, + we can control that we don't transform the same triplet of atoms back + and forth by adding their indices to a list. This function also handles conjugated systems with triple bonds. The molecule needs to be kekulized first to also cover systems with aromatic rings. @@ -398,8 +416,8 @@ def _rebuild_conjugated_bonds( mol : rdkit.Chem.rdchem.RWMol The molecule to transform, modified inplace max_iter : Optional[int] - Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. Maximum number - of iterations to standardize conjugated systems + Deprecated, use ``MDAnalysisInferer(max_iter=...)`` instead. + Maximum number of iterations to standardize conjugated systems Notes ----- @@ -412,26 +430,31 @@ def _rebuild_conjugated_bonds( # there's usually an even number of matches for this pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]") # pattern used to finish fixing a series of conjugated bonds - base_end_pattern = Chem.MolFromSmarts("[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]") + base_end_pattern = Chem.MolFromSmarts( + "[*-{1-2}]-,=[*+0]=,#[*+0]-,=[*-{1-2}]" + ) # used when there's an odd number of matches for `pattern` odd_end_pattern = Chem.MolFromSmarts( - "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O),$([S;D4;v4]-[O-])]" + "[*-]-[*+0]=[*+0]-[*-,$([#7;X3;v3]),$([#6+0,#7+1]=O)," + "$([S;D4;v4]-[O-])]" ) # number of unique matches with the pattern - n_matches = len(set([match[0] for match in mol.GetSubstructMatches(pattern)])) + n_matches = len( + set([match[0] for match in mol.GetSubstructMatches(pattern)]) + ) # nothing to standardize if n_matches == 0: return # single match (unusual) elif n_matches == 1: - # as a last resort, the only way to standardize is to find a nitrogen - # that can accept a double bond and a positive charge + # as a last resort, the only way to standardize is to find a + # nitrogen that can accept a double bond and a positive charge # or a carbonyl that will become an enolate end_pattern = odd_end_pattern # at least 2 matches else: - # give priority to base end pattern and then deal with the odd one if - # necessary + # give priority to base end pattern and then deal with the odd one + # if necessary end_pattern = base_end_pattern backtrack = [] backtrack_cycles = 0 @@ -446,12 +469,16 @@ def _rebuild_conjugated_bonds( # [*-]-*=*-[C,N+]=O --> *=*-*=[C,N+]-[O-] # transform the =O to -[O-] if ( - term_atom.GetAtomicNum() == 6 and term_atom.GetFormalCharge() == 0 + term_atom.GetAtomicNum() == 6 + and term_atom.GetFormalCharge() == 0 ) or ( - term_atom.GetAtomicNum() == 7 and term_atom.GetFormalCharge() == 1 + term_atom.GetAtomicNum() == 7 + and term_atom.GetFormalCharge() == 1 ): for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + bond = mol.GetBondBetweenAtoms( + anion2, neighbor.GetIdx() + ) if ( neighbor.GetAtomicNum() == 8 and bond.GetBondTypeAsDouble() == 2 @@ -463,10 +490,13 @@ def _rebuild_conjugated_bonds( # [*-]-*=*-[Sv4]-[O-] --> *=*-*=[Sv6]=O # transform -[O-] to =O elif ( - term_atom.GetAtomicNum() == 16 and term_atom.GetFormalCharge() == 0 + term_atom.GetAtomicNum() == 16 + and term_atom.GetFormalCharge() == 0 ): for neighbor in term_atom.GetNeighbors(): - bond = mol.GetBondBetweenAtoms(anion2, neighbor.GetIdx()) + bond = mol.GetBondBetweenAtoms( + anion2, neighbor.GetIdx() + ) if ( neighbor.GetAtomicNum() == 8 and neighbor.GetFormalCharge() == -1 @@ -511,8 +541,8 @@ def _rebuild_conjugated_bonds( # already performed all changes else: if backtrack_cycles == 1: - # might be stuck because there were 2 "odd" end patterns - # misqualified as a single base one + # might be stuck because there were 2 "odd" end + # patterns misqualified as a single base one end_pattern = odd_end_pattern elif backtrack_cycles > 1: # likely stuck changing the same bonds over and over so @@ -555,8 +585,8 @@ def _rebuild_conjugated_bonds( @dataclass(frozen=True) class TemplateInferer: - """Infer bond orders and charges by matching the molecule with a template molecule - containing bond orders and charges. + """Infer bond orders and charges by matching the molecule with a template + molecule containing bond orders and charges. Attributes ---------- diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 2ee58f7d7a8..413f4587e3e 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -30,9 +30,15 @@ import numpy as np import pytest from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysisTests.datafiles import GRO, PDB_full, PDB_helix, mol2_molecule +from MDAnalysisTests.datafiles import ( + GRO, + TRR, + PDB_full, + PDB_helix, + mol2_molecule, +) from MDAnalysisTests.util import import_not_available -from numpy.testing import assert_allclose, assert_equal +from numpy.testing import assert_allclose, assert_array_equal, assert_equal with suppress(ImportError): from MDAnalysis.converters.RDKit import ( @@ -41,7 +47,10 @@ _set_atom_property, atomgroup_to_mol, ) - from MDAnalysis.converters.RDKitInferring import MDAnalysisInferer, TemplateInferer + from MDAnalysis.converters.RDKitInferring import ( + MDAnalysisInferer, + TemplateInferer, + ) from rdkit import Chem from rdkit.Chem import AllChem @@ -124,7 +133,8 @@ def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames expected = np.array( - [conf.GetPositions() for conf in rdmol.GetConformers()], dtype=np.float32 + [conf.GetPositions() for conf in rdmol.GetConformers()], + dtype=np.float32, ) assert_equal(expected, universe.trajectory.coordinate_array) @@ -139,7 +149,9 @@ def test_compare_mol2reader(self): universe = mda.Universe(MolFactory.mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal(universe.trajectory.ts.positions, mol2.trajectory.ts.positions) + assert_equal( + universe.trajectory.ts.positions, mol2.trajectory.ts.positions + ) @requires_rdkit @@ -176,7 +188,9 @@ def test_no_atoms_attr(self): @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) def test_single_atom_mol(self, smi): - u = mda.Universe.from_smiles(smi, addHs=False, generate_coordinates=False) + u = mda.Universe.from_smiles( + smi, addHs=False, generate_coordinates=False + ) mol = u.atoms.convert_to.rdkit(inferer=None) assert mol.GetNumAtoms() == 1 assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") @@ -219,7 +233,9 @@ def test_monomer_info(self, pdb, sel_str, atom_index): assert mda_atom.segindex == mi.GetSegmentNumber() assert mda_atom.tempfactor == mi.GetTempFactor() - @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], indirect=True) + @pytest.mark.parametrize( + "rdmol", ["mol2_mol", "smiles_mol"], indirect=True + ) def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") @@ -228,7 +244,9 @@ def test_identical_topology(self, rdmol): assert_equal(u.atoms.bonds, u2.atoms.bonds) assert_equal(u.atoms.elements, u2.atoms.elements) assert_equal(u.atoms.names, u2.atoms.names) - assert_allclose(u.atoms.positions, u2.atoms.positions, rtol=0, atol=1e-7) + assert_allclose( + u.atoms.positions, u2.atoms.positions, rtol=0, atol=1e-7 + ) def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) @@ -244,7 +262,9 @@ def test_raise_requires_elements(self): def test_warn_guess_bonds(self): u = mda.Universe(PDB_helix) - with pytest.warns(UserWarning, match="No `bonds` attribute in this AtomGroup"): + with pytest.warns( + UserWarning, match="No `bonds` attribute in this AtomGroup" + ): u.atoms.convert_to("RDKIT") def test_bonds_outside_sel(self): @@ -265,7 +285,9 @@ def test_error_no_hydrogen_skip_inferring(self, uo2): uo2.atoms.convert_to.rdkit(inferer=None) def test_warning_no_hydrogen_force(self, uo2): - with pytest.warns(UserWarning, match="Forcing to continue the conversion"): + with pytest.warns( + UserWarning, match="Forcing to continue the conversion" + ): uo2.atoms.convert_to.rdkit(force=True) @pytest.mark.parametrize( @@ -413,15 +435,21 @@ def test_sanitize_fail_warning(self): ) def test_deprecation_maw_iter(self, mol2): - with pytest.warns(DeprecationWarning, match="Using `max_iter` is deprecated"): + with pytest.warns( + DeprecationWarning, match="Using `max_iter` is deprecated" + ): mol2.atoms.convert_to.rdkit(max_iter=2) def test_deprecation_NoImplicit(self, mol2): - with pytest.warns(DeprecationWarning, match="Using `NoImplicit` is deprecated"): + with pytest.warns( + DeprecationWarning, match="Using `NoImplicit` is deprecated" + ): mol2.atoms.convert_to.rdkit(NoImplicit=True) def test_deprecation_atomgroup_to_mol_NoImplicit(self, mol2): - with pytest.warns(DeprecationWarning, match="Using `NoImplicit` is deprecated"): + with pytest.warns( + DeprecationWarning, match="Using `NoImplicit` is deprecated" + ): atomgroup_to_mol(mol2.atoms, NoImplicit=False) def test_atomgroup_to_mol_unexpected_kwargs(self, mol2): @@ -480,8 +508,12 @@ def assert_isomorphic_resonance_structure(self, mol, ref): """ isomorphic = mol.HasSubstructMatch(ref) if not isomorphic: - isomorphic = bool(Chem.ResonanceMolSupplier(mol).GetSubstructMatch(ref)) - assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" + isomorphic = bool( + Chem.ResonanceMolSupplier(mol).GetSubstructMatch(ref) + ) + assert ( + isomorphic + ), f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" @pytest.mark.parametrize( "smi, out", @@ -512,7 +544,9 @@ def test_infer_bond_orders(self, smi, out): Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), "{} != {}".format( + Chem.MolToSmiles(mol), out + ) @pytest.mark.parametrize( "smi, atom_idx, charge", @@ -537,7 +571,10 @@ def test_infer_charges(self, smi, atom_idx, charge): ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), - ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", "CNC(N)=[N+](-[H])-[H]"), + ( + "C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", + "CNC(N)=[N+](-[H])-[H]", + ), ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), @@ -548,7 +585,10 @@ def test_infer_charges(self, smi, atom_idx, charge): "[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", "[O-]c1ccccc1", ), - ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", "[O-]C1=CC=CC1=O"), + ( + "[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", + "[O-]C1=CC=CC1=O", + ), ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), ], @@ -559,7 +599,9 @@ def test_standardize_patterns(self, smi, out): mol = self.assign_bond_orders_and_charges(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format(Chem.MolToSmiles(mol), out) + assert is_isomorphic(mol, molref), "{} != {}".format( + Chem.MolToSmiles(mol), out + ) @pytest.mark.parametrize( "attr, value, getter", @@ -720,7 +762,9 @@ def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) inferer = MDAnalysisInferer(max_iter=2) - with pytest.warns(UserWarning, match="reasonable number of iterations"): + with pytest.warns( + UserWarning, match="reasonable number of iterations" + ): inferer._rebuild_conjugated_bonds(mol) def test_deprecation_warning_max_iter(self): @@ -785,7 +829,9 @@ def test_pdb_names(self): u = mda.Universe(PDB_helix) mol = u.atoms.convert_to.rdkit() names = u.atoms.names - rd_names = np.array([a.GetProp("_MDAnalysis_name") for a in mol.GetAtoms()]) + rd_names = np.array( + [a.GetProp("_MDAnalysis_name") for a in mol.GetAtoms()] + ) assert (names == rd_names).all() @pytest.mark.parametrize( @@ -804,7 +850,9 @@ def test_bond_stereo_not_stereoany(self, smi): assert bond.GetStereo() != Chem.BondStereo.STEREOANY def test_atom_sorter(self): - mol = Chem.MolFromSmiles("[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False) + mol = Chem.MolFromSmiles( + "[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False + ) # corresponding mol: C=C-C#C # atom indices: 1 3 5 6 mol.UpdatePropertyCache() From d352d9424473fc33e514530a920fd39388809293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 15 Dec 2023 20:25:34 +0100 Subject: [PATCH 09/14] add adjust_hydrogens to TemplateInferer --- .../MDAnalysis/converters/RDKitInferring.py | 108 ++++++++++++++---- .../MDAnalysisTests/converters/test_rdkit.py | 66 ++++++++--- 2 files changed, 138 insertions(+), 36 deletions(-) diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index aa59dfe460b..3930b9e4ff4 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -36,6 +36,7 @@ :members: """ import warnings +from collections import defaultdict from contextlib import suppress from dataclasses import dataclass from typing import ClassVar, Dict, List, Optional, Tuple @@ -59,6 +60,32 @@ PERIODIC_TABLE = Chem.GetPeriodicTable() +def reorder_atoms( + mol: "Chem.Mol", field: str = "_MDAnalysis_index" +) -> "Chem.Mol": + """Reorder atoms to match with the original MDAnalysis AtomGroup.""" + with suppress(KeyError): + order = np.argsort([atom.GetIntProp(field) for atom in mol.GetAtoms()]) + return Chem.RenumberAtoms(mol, order.astype(int).tolist()) + # not a molecule converted by MDAnalysis + warnings.warn( + f"{field} not available on the input mol atoms, skipping reordering " + "of atoms." + ) + return mol + + +def sanitize_mol(mol: "Chem.Mol") -> None: + """Tries to sanitize the molecule, and if an error is raised, logs it as a + warning. + """ + err = Chem.SanitizeMol(mol, catchErrors=True) + if err: + warnings.warn( + f"Could not sanitize molecule: failed during step {err!r}" + ) + + @dataclass(frozen=True) class MDAnalysisInferer: """Bond order and formal charge inferring as originally implemented for @@ -124,28 +151,8 @@ def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": """ self._infer_bo_and_charges(mol) mol = self._standardize_patterns(mol) - mol = self._reorder_atoms(mol) - # sanitize if possible - err = Chem.SanitizeMol(mol, catchErrors=True) - if err: - warnings.warn( - f"Could not sanitize molecule: failed during step {err!r}" - ) - return mol - - def _reorder_atoms(self, mol: "Chem.Mol") -> "Chem.Mol": - """Reorder atoms to match MDAnalysis, since the reactions from - :meth:`_standardize_patterns` will mess up the original order. - """ - with suppress(KeyError): - order = np.argsort( - [ - atom.GetIntProp("_MDAnalysis_index") - for atom in mol.GetAtoms() - ] - ) - return Chem.RenumberAtoms(mol, order.astype(int).tolist()) - # not a molecule converted by MDAnalysis + mol = reorder_atoms(mol) + sanitize_mol(mol) return mol @classmethod @@ -592,9 +599,66 @@ class TemplateInferer: ---------- template : rdkit.Chem.rdchem.Mol Molecule containing the bond orders and charges. + adjust_hydrogens: bool, default = True + If ``True``, temporarily removes hydrogens from the molecule to + assign bond orders and charges from the template, then adds them + back. Useful to avoid adding explicit hydrogens on the template which + can prevent RDKit from finding a match between the template and the + molecule. """ template: "Chem.Mol" + adjust_hydrogens: bool = True def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + if self.adjust_hydrogens: + return self.assign_from_template_with_adjusted_hydrogens(mol) return AssignBondOrdersFromTemplate(self.template, mol) + + def assign_from_template_with_adjusted_hydrogens( + self, mol: "Chem.Mol", index_field: str = "_MDAnalysis_index" + ) -> "Chem.Mol": + """Temporarily removes hydrogens from the molecule before assigning + bond orders and charges from the template, then adds them back. + + Parameters + ---------- + mol : rdkit.Chem.rdchem.Mol + Molecule to assign bond orders and charges to. + index_field : str, default = "_MDAnalysis_index" + Atom property corresponding to a unique integer that can used to + put back the hydrogen atoms that were removed before matching, + and sort them back to the original order. Must be accessible + through ``atom.GetIntProp``. + """ + # remove explicit hydrogens and assign BO from template + mol_no_h = Chem.RemoveAllHs(mol, sanitize=False) + new = AssignBondOrdersFromTemplate(self.template, mol_no_h) + # mapping between AtomGroup index of atom bearing H --> H atom(s) + atoms_with_hydrogens = defaultdict(list) + for atom in mol.GetAtoms(): + if atom.GetAtomicNum() == 1: + atoms_with_hydrogens[ + atom.GetNeighbors()[0].GetIntProp(index_field) + ].append(atom) + # mapping between atom that should be bearing a H in RWMol and + # corresponding H(s) + reverse_mapping = {} + for atom in new.GetAtoms(): + if (idx := atom.GetIntProp(index_field)) in atoms_with_hydrogens: + reverse_mapping[atom.GetIdx()] = atoms_with_hydrogens[idx] + # let UpdatePropertyCache take care of recalculating the + # number of explicit hydrogens to prevent sanitization errors + atom.SetNumExplicitHs(0) + # add missing Hs + rwmol = Chem.RWMol(new) + for atom_idx, hydrogens in reverse_mapping.items(): + for hydrogen in hydrogens: + h_idx = rwmol.AddAtom(hydrogen) + rwmol.AddBond(atom_idx, h_idx, Chem.BondType.SINGLE) + new = rwmol.GetMol() + # sanitize + new.UpdatePropertyCache() + sanitize_mol(new) + # reorder atoms as input atomgroup (through _MDAnalysis_index) + return reorder_atoms(new, field=index_field) diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 413f4587e3e..4ed16da9021 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -868,22 +868,60 @@ def test_atom_sorter(self): class TestRDKitTemplateInferer: @pytest.fixture(scope="function") def template(self): - params = Chem.SmilesParserParams() - params.removeHs = False - return Chem.MolFromSmiles("[H]-[N+](-[H])(-[H])-[H]", params) + return Chem.MolFromSmiles("[NH3+]-C=O") - def test_template_inferring(self, template): + @pytest.fixture(scope="function") + def template_with_hydrogens(self): params = Chem.SmilesParserParams() params.removeHs = False - params.sanitize = False - mol = Chem.MolFromSmiles("[H]-N(-[H])(-[H])-[H]", params) - assert mol.GetAtomWithIdx(1).GetFormalCharge() == 0 + return Chem.MolFromSmiles("[N+](-[H])(-[H])(-[H])-C(-[H])=O", params) + + @pytest.fixture(scope="function") + def mol(self, template): + mol = Chem.AddHs(template) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + mol.GetAtomWithIdx(0).SetFormalCharge(0) + mol.UpdatePropertyCache(strict=False) + for atom in mol.GetAtoms(): + atom.SetIntProp("_MDAnalysis_index", atom.GetIdx()) + return mol - inferer = TemplateInferer(template=template) - mol = inferer(mol) - assert mol.GetAtomWithIdx(1).GetFormalCharge() == 1 + def test_template_inferring(self, mol, template_with_hydrogens): + inferer = TemplateInferer( + template=template_with_hydrogens, adjust_hydrogens=False + ) + new = inferer(mol) + assert new.GetAtomWithIdx(0).GetFormalCharge() == 1 + + def test_template_inferring_adjust_hydrogens(self, mol, template): + inferer = TemplateInferer(template=template, adjust_hydrogens=True) + new = inferer(mol) + assert new.GetAtomWithIdx(0).GetFormalCharge() == 1 + + def test_from_convert_to_method(self, template_with_hydrogens): + mol = Chem.Mol(template_with_hydrogens) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + u = mda.Universe(mol) + mol = u.atoms.convert_to.rdkit( + inferer=TemplateInferer( + template=template_with_hydrogens, adjust_hydrogens=False + ) + ) + assert mol.GetAtomWithIdx(0).GetFormalCharge() == 1 + xyz = mol.GetConformer().GetPositions() + assert_array_equal(xyz, u.atoms.positions) - def test_from_convert_to_method(self, template): - u = mda.Universe(template) - mol = u.atoms.convert_to.rdkit(inferer=TemplateInferer(template=template)) - assert mol.GetAtomWithIdx(1).GetFormalCharge() == 1 + def test_adjust_hydrogens_bigger_mol(self): + u = mda.Universe(GRO, TRR) + elements = mda.topology.guessers.guess_types(u.atoms.names) + u.add_TopologyAttr("elements", elements) + ag = u.select_atoms("resid 1-3") + inferer = TemplateInferer( + template=Chem.MolFromSmiles( + "CCC(C)C(C=O)NC(=O)C(CCCNC(N)=[NH2+])NC(=O)C([NH3+])CCSC" + ), + adjust_hydrogens=True, + ) + mol = ag.convert_to.rdkit(inferer=inferer) + xyz = mol.GetConformer().GetPositions() + assert_array_equal(xyz, ag.positions) From a2447a1d17aa8b7a164e06dc8eba66aa1f907ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 15 Dec 2023 21:34:47 +0100 Subject: [PATCH 10/14] improve test coverage --- .../MDAnalysis/converters/RDKitInferring.py | 2 +- .../MDAnalysisTests/converters/test_rdkit.py | 35 +++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 3930b9e4ff4..3bb53500c85 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -69,7 +69,7 @@ def reorder_atoms( return Chem.RenumberAtoms(mol, order.astype(int).tolist()) # not a molecule converted by MDAnalysis warnings.warn( - f"{field} not available on the input mol atoms, skipping reordering " + f"{field!r} not available on the input mol atoms, skipping reordering " "of atoms." ) return mol diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 4ed16da9021..7e49ccf2fad 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -25,6 +25,7 @@ import warnings from contextlib import suppress from io import StringIO +from unittest.mock import Mock import MDAnalysis as mda import numpy as np @@ -50,6 +51,8 @@ from MDAnalysis.converters.RDKitInferring import ( MDAnalysisInferer, TemplateInferer, + reorder_atoms, + sanitize_mol, ) from rdkit import Chem from rdkit.Chem import AllChem @@ -434,11 +437,16 @@ def test_sanitize_fail_warning(self): [str(r.message) for r in record] ) - def test_deprecation_maw_iter(self, mol2): + def test_deprecation_max_iter(self, mol2, monkeypatch): + mock = Mock(wraps=atomgroup_to_mol) + monkeypatch.setattr( + "MDAnalysis.converters.RDKit.atomgroup_to_mol", mock + ) with pytest.warns( DeprecationWarning, match="Using `max_iter` is deprecated" ): mol2.atoms.convert_to.rdkit(max_iter=2) + assert mock.call_args.kwargs["inferer"].max_iter == 2 def test_deprecation_NoImplicit(self, mol2): with pytest.warns( @@ -469,7 +477,30 @@ def potatoe(mol): @requires_rdkit -class TestRDKitMDAnalysisInferer(object): +class TestRDKitInferringFunctions: + def test_sanitize_mol_warning(self): + mol = Chem.MolFromSmiles("[NH4]", sanitize=False) + with pytest.warns( + match=( + "Could not sanitize molecule: failed during step " + "rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_PROPERTIES" + ) + ): + sanitize_mol(mol) + + def test_reorder_atoms_key_error(self): + mol = Chem.MolFromSmiles("CCO") + with pytest.warns( + match=( + "'_MDAnalysis_index' not available on the input mol atoms, " + "skipping reordering of atoms" + ) + ): + reorder_atoms(mol) + + +@requires_rdkit +class TestRDKitMDAnalysisInferer: def add_Hs_remove_bo_and_charges(self, mol): """Add hydrogens and remove bond orders and charges from a molecule""" mH = Chem.AddHs(mol) From aa6765caf8eb7cdd46fa5dc51e4f4791347123d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Fri, 15 Dec 2023 23:53:40 +0100 Subject: [PATCH 11/14] more nitpicks --- package/MDAnalysis/converters/RDKit.py | 6 ------ .../MDAnalysis/converters/RDKitInferring.py | 18 +++++++++++++++--- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index 109016f9883..c900dc4c376 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -68,12 +68,6 @@ .. autoclass:: RDKitConverter :members: -.. autofunction:: _infer_bo_and_charges - -.. autofunction:: _standardize_patterns - -.. autofunction:: _rebuild_conjugated_bonds - .. Links diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 3bb53500c85..008a98383dc 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -34,6 +34,11 @@ .. autoclass:: TemplateInferer :members: + +.. autofunction:: sanitize_mol + +.. autofunction:: reorder_atoms + """ import warnings from collections import defaultdict @@ -63,7 +68,13 @@ def reorder_atoms( mol: "Chem.Mol", field: str = "_MDAnalysis_index" ) -> "Chem.Mol": - """Reorder atoms to match with the original MDAnalysis AtomGroup.""" + """Reorder atoms based on the given field. Defaults to sorting in the same + order as the input AtomGroup. + + Notes + ----- + If the field is not found on the molecule, skips reordering. + """ with suppress(KeyError): order = np.argsort([atom.GetIntProp(field) for atom in mol.GetAtoms()]) return Chem.RenumberAtoms(mol, order.astype(int).tolist()) @@ -647,9 +658,10 @@ def assign_from_template_with_adjusted_hydrogens( for atom in new.GetAtoms(): if (idx := atom.GetIntProp(index_field)) in atoms_with_hydrogens: reverse_mapping[atom.GetIdx()] = atoms_with_hydrogens[idx] - # let UpdatePropertyCache take care of recalculating the - # number of explicit hydrogens to prevent sanitization errors + # Set ExplicitH to 0 to avoid valence errors when adding Hs atom.SetNumExplicitHs(0) + # let UpdatePropertyCache recalculate the number of radicals later + atom.SetNumRadicalElectrons(0) # add missing Hs rwmol = Chem.RWMol(new) for atom_idx, hydrogens in reverse_mapping.items(): From acb20802a0f20a8c1388363f9feb5bcc0726609d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Sat, 16 Dec 2023 17:37:54 +0100 Subject: [PATCH 12/14] add rdDetermineBonds inferer --- .github/actions/setup-deps/action.yaml | 2 +- azure-pipelines.yml | 2 +- .../MDAnalysis/converters/RDKitInferring.py | 23 ++++ package/pyproject.toml | 2 +- .../MDAnalysisTests/converters/test_rdkit.py | 115 +++++++++++++----- 5 files changed, 112 insertions(+), 32 deletions(-) diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml index e5a0794450c..a5163a3170a 100644 --- a/.github/actions/setup-deps/action.yaml +++ b/.github/actions/setup-deps/action.yaml @@ -79,7 +79,7 @@ inputs: pytng: default: 'pytng>=0.2.3' rdkit: - default: 'rdkit>=2020.03.1' + default: 'rdkit>=2022.09.1' scikit-learn: default: 'scikit-learn' seaborn: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 41424cf50de..a41aa2dae70 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -113,7 +113,7 @@ jobs: parmed pytng>=0.2.3 tidynamics>=1.0.0 - rdkit>=2020.03.1 + rdkit>=2022.09.1 displayName: 'Install additional dependencies for 64-bit tests' condition: and(succeeded(), eq(variables['PYTHON_ARCH'], 'x64')) - script: >- diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 008a98383dc..53a68b48793 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -35,6 +35,9 @@ .. autoclass:: TemplateInferer :members: +.. autoclass:: RDKitInferer + :members: + .. autofunction:: sanitize_mol .. autofunction:: reorder_atoms @@ -64,6 +67,11 @@ RDBONDORDER.update({str(key): value for key, value in RDBONDORDER.items()}) PERIODIC_TABLE = Chem.GetPeriodicTable() +with suppress(ImportError): + from rdkit.Chem.rdDetermineBonds import ( + DetermineBondOrders, # available since 2022.09.1 + ) + def reorder_atoms( mol: "Chem.Mol", field: str = "_MDAnalysis_index" @@ -674,3 +682,18 @@ def assign_from_template_with_adjusted_hydrogens( sanitize_mol(new) # reorder atoms as input atomgroup (through _MDAnalysis_index) return reorder_atoms(new, field=index_field) + + +@dataclass(frozen=True) +class RDKitInferer: + """Uses RDKit's :func:`~rdkit.Chem.rdDetermineBonds.DetermineBondOrders` + to infer bond orders and formal charges. This is the same algorithm used + by the :ref:`xyz2mol ` package. + """ + + charge: int = 0 + + def __call__(self, mol: "Chem.Mol") -> "Chem.Mol": + new = Chem.Mol(mol) + DetermineBondOrders(new, charge=self.charge) + return new diff --git a/package/pyproject.toml b/package/pyproject.toml index d1e8e741c5a..8fdb1b180d0 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -79,7 +79,7 @@ extra_formats = [ "pyedr>=0.7.0", "pytng>=0.2.3", "gsd>3.0.0", - "rdkit>=2020.03.1", + "rdkit>=2021.09.1", ] analysis = [ "biopython>=1.80", diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index 7e49ccf2fad..ad1e2682a51 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -50,6 +50,7 @@ ) from MDAnalysis.converters.RDKitInferring import ( MDAnalysisInferer, + RDKitInferer, TemplateInferer, reorder_atoms, sanitize_mol, @@ -499,8 +500,7 @@ def test_reorder_atoms_key_error(self): reorder_atoms(mol) -@requires_rdkit -class TestRDKitMDAnalysisInferer: +class BaseInferer: def add_Hs_remove_bo_and_charges(self, mol): """Add hydrogens and remove bond orders and charges from a molecule""" mH = Chem.AddHs(mol) @@ -514,25 +514,6 @@ def add_Hs_remove_bo_and_charges(self, mol): mH.UpdatePropertyCache(strict=False) return mH - def enumerate_reordered_mol(self, mol): - """Enumerates all possible starting atoms for a given molecule""" - # go through each possible starting atom - for root_atom in mol.GetAtoms(): - smi = Chem.MolToSmiles(mol, rootedAtAtom=root_atom.GetIdx()) - reordered_mol = Chem.MolFromSmiles(smi, sanitize=False) - for atom in reordered_mol.GetAtoms(): - atom.SetNoImplicit(True) - reordered_mol.UpdatePropertyCache(strict=False) - yield reordered_mol - - def assign_bond_orders_and_charges(self, mol): - """Returns a sanitized molecule with infered bond orders and charges""" - inferer = MDAnalysisInferer() - inferer._infer_bo_and_charges(mol) - mol = inferer._standardize_patterns(mol) - Chem.SanitizeMol(mol) - return mol - def assert_isomorphic_resonance_structure(self, mol, ref): """Checks if 2 molecules are isomorphic using their resonance structures @@ -546,6 +527,24 @@ def assert_isomorphic_resonance_structure(self, mol, ref): isomorphic ), f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" + +@requires_rdkit +class TestRDKitMDAnalysisInferer(BaseInferer): + def enumerate_reordered_mol(self, mol): + """Enumerates all possible starting atoms for a given molecule""" + # go through each possible starting atom + for root_atom in mol.GetAtoms(): + smi = Chem.MolToSmiles(mol, rootedAtAtom=root_atom.GetIdx()) + reordered_mol = Chem.MolFromSmiles(smi, sanitize=False) + for atom in reordered_mol.GetAtoms(): + atom.SetNoImplicit(True) + reordered_mol.UpdatePropertyCache(strict=False) + yield reordered_mol + + @pytest.fixture(scope="class") + def inferer(self): + return MDAnalysisInferer() + @pytest.mark.parametrize( "smi, out", [ @@ -624,10 +623,10 @@ def test_infer_charges(self, smi, atom_idx, charge): ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), ], ) - def test_standardize_patterns(self, smi, out): + def test_standardize_patterns(self, inferer, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) - mol = self.assign_bond_orders_and_charges(mol) + mol = inferer(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) assert is_isomorphic(mol, molref), "{} != {}".format( @@ -671,7 +670,7 @@ def test_ignore_prop(self): "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", ], ) - def test_transfer_properties(self, smi): + def test_transfer_properties(self, inferer, smi): mol = Chem.MolFromSmiles(smi) mol = self.add_Hs_remove_bo_and_charges(mol) old = {} @@ -680,7 +679,7 @@ def test_transfer_properties(self, smi): atom.SetIntProp("_MDAnalysis_index", ix) atom.SetProp("dummy", f"foo_{ix}") old[ix] = {"_MDAnalysis_index": ix, "dummy": f"foo_{ix}"} - newmol = self.assign_bond_orders_and_charges(mol) + newmol = inferer(mol) new = {} for a in newmol.GetAtoms(): ix = a.GetIntProp("_MDAnalysis_index") @@ -766,13 +765,13 @@ def test_transfer_properties(self, smi): "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", ], ) - def test_order_independant(self, smi): + def test_order_independant(self, inferer, smi): # generate mol with hydrogens but without bond orders ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) # enumerate mols with different starting atoms for m in self.enumerate_reordered_mol(stripped_mol): - m = self.assign_bond_orders_and_charges(m) + m = inferer(m) m = Chem.RemoveHs(m) self.assert_isomorphic_resonance_structure(m, ref) @@ -830,10 +829,10 @@ def test_deprecation_warning_max_iter(self): "[Na+].[Cl-]", ], ) - def test_ions(self, smi): + def test_ions(self, inferer, smi): ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) - mol = self.assign_bond_orders_and_charges(stripped_mol) + mol = inferer(stripped_mol) assert is_isomorphic(mol, ref) @pytest.mark.parametrize( @@ -956,3 +955,61 @@ def test_adjust_hydrogens_bigger_mol(self): mol = ag.convert_to.rdkit(inferer=inferer) xyz = mol.GetConformer().GetPositions() assert_array_equal(xyz, ag.positions) + + +@requires_rdkit +class TestRDKitInferer(BaseInferer): + @pytest.mark.parametrize( + "smi", + [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", + # amino acids + "C[C@H](N)C(=O)O", # A + "NCC(=O)O", # G + "CC[C@H](C)[C@H](N)C(=O)O", # I + "CC(C)C[C@H](N)C(=O)O", # L + "O=C(O)[C@@H]1CCCN1", # P + "CC(C)[C@H](N)C(=O)O", # V + "N[C@@H](Cc1ccccc1)C(=O)O", # F + "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W + "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y + "N[C@@H](CC(=O)O)C(=O)O", # D + "N[C@@H](CCC(=O)O)C(=O)O", # E + "N=C(N)NCCC[C@H](N)C(=O)O", # R + "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H + "NCCCC[C@H](N)C(=O)O", # K + "N[C@@H](CO)C(=O)O", # S + "C[C@@H](O)[C@H](N)C(=O)O", # T + "N[C@@H](CS)C(=O)O", # C + "CSCC[C@H](N)C(=O)O", # M + "NC(=O)C[C@H](N)C(=O)O", # N + "NC(=O)CC[C@H](N)C(=O)O", # Q + # nucleic acids + "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A + "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T + "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U + "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C + "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G + ], + ) + def test_inferring(self, smi): + # generate mol with hydrogens but without bond orders + ref = Chem.MolFromSmiles(smi) + charge = sum(atom.GetFormalCharge() for atom in ref.GetAtoms()) + inferer = RDKitInferer(charge=charge) + mol = Chem.AddHs(ref) + AllChem.EmbedMolecule(mol, randomSeed=0xAC1D1C) + stripped_mol = self.add_Hs_remove_bo_and_charges(mol) + new = inferer(stripped_mol) + new = Chem.RemoveHs(new) + self.assert_isomorphic_resonance_structure(new, ref) From f1ad0924a96cd56252f7d9d6d8530973b6c21619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 1 Apr 2024 18:32:07 +0100 Subject: [PATCH 13/14] address comments and linting --- package/CHANGELOG | 9 +++++- package/MDAnalysis/converters/RDKit.py | 32 ++++++++++++------- .../MDAnalysis/converters/RDKitInferring.py | 12 ++++--- package/pyproject.toml | 22 +++++++------ 4 files changed, 49 insertions(+), 26 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index e59f1b068d0..a3583b2258a 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -16,7 +16,7 @@ The rules for this file: ------------------------------------------------------------------------------- ??/??/?? IAlibay, HeetVekariya, marinegor, lilyminium, RMeli, ljwoods2, aditya292002, pstaerk, PicoCentauri, BFedder, - tyler.je.reddy, SampurnaM, leonwehrhan, kainszs, orionarcher + tyler.je.reddy, SampurnaM, leonwehrhan, kainszs, orionarcher, cbouy * 2.8.0 @@ -39,6 +39,8 @@ Fixes * Fix groups.py doctests using sphinx directives (Issue #3925, PR #4374) Enhancements + * Added `TemplateInferer` and `RDKitInferer` dataclasses to the + `RDKitInferring` module to be used by the RDKit converter. (PR #4305) * Added a tqdm progress bar for `MDAnalysis.analysis.pca.PCA.transform()` (PR #4531) * Improved performance of PDBWriter (Issue #2785, PR #4472) @@ -49,6 +51,9 @@ Enhancements DOI 10.1021/acs.jpcb.7b11988. (Issue #2039, PR #4524) Changes + * Refactored the RDKit converter code to move the inferring code in a separate + `RDKitInferring` module. The bond order and charges inferer has been move to + an `MDAnalysisInferer` dataclass in there. (PR #4305) * As per SPEC0 the minimum supported Python version has been raised to 3.10 (PR #4502) * MDAnalysis.analysis.hole2 is now directly imported from the mdakit @@ -66,6 +71,8 @@ Changes numpy.testing.assert_allclose #4438) Deprecations + * The RDKit converter parameter `NoImplicit` has been deprecated in favour of + `implicit_hydrogens` and `inferer` parameters. (PR #4305) * The MDAnalysis.analysis.waterdynamics module has been deprecated in favour of the waterdynamics MDAKit and will be removed in version 3.0.0 (PR #4404) * The MDAnalysis.analysis.psa module has been deprecated in favour of diff --git a/package/MDAnalysis/converters/RDKit.py b/package/MDAnalysis/converters/RDKit.py index c900dc4c376..bffdbe3a568 100644 --- a/package/MDAnalysis/converters/RDKit.py +++ b/package/MDAnalysis/converters/RDKit.py @@ -257,6 +257,9 @@ class RDKitConverter(base.ConverterBase): >>> mol = u.atoms.convert_to.rdkit(inferer=infer_from_template) + Note that all builtin inferer functions can be found in the + :mod:`RDKitInferring` module. + Since one of the main use case of the converter is converting trajectories and not just a topology, creating a new molecule from scratch for every frame would be too slow so the converter uses a caching system. The cache @@ -292,7 +295,7 @@ class RDKitConverter(base.ConverterBase): files while the original name is still available through ``atom.GetProp("_MDAnalysis_name")`` - .. versionchanged:: 2.7.0 + .. versionchanged:: 2.8.0 Deprecated ``max_iter`` (moved to the inferer class :class:`~MDAnalysis.converters.RDKitInferring.MDAnalysisInferer`) and deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added @@ -341,20 +344,20 @@ def convert( raise ImportError( "RDKit is required for the RDKitConverter but " "it's not installed. Try installing it with \n" - "conda install -c conda-forge rdkit" - ) + "`conda install -c conda-forge rdkit` or `pip install rdkit`" + ) from None try: # make sure to use atoms (Issue 46) ag = obj.atoms except AttributeError: raise TypeError( - "No `atoms` attribute in object of type {}, " - "please use a valid AtomGroup or Universe".format(type(obj)) + f"No `atoms` attribute in object of type {type(obj)!r}, " + "please use a valid AtomGroup or Universe" ) from None if (max_iter := kwargs.get("max_iter")) is not None: warnings.warn( - "Using `max_iter` is deprecated, use `MDAnalysisInferer " + "Using `max_iter` is deprecated, use `MDAnalysisInferer" "(max_iter=...)` instead", DeprecationWarning, ) @@ -427,6 +430,12 @@ def atomgroup_to_mol( inferer : Optional[Callable[[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol]] A callable to infer bond orders and charges for the RDKit molecule created by the converter. If ``None``, inferring is skipped. + + + .. versionchanged:: 2.8.0 + Deprecated ``NoImplicit`` in favor of ``implicit_hydrogens``. Added + ``inferer`` to specify a callable that can transform the molecule (this + operation is cached). """ try: elements = ag.elements @@ -469,7 +478,7 @@ def atomgroup_to_mol( # attributes accepted in PDBResidueInfo object pdb_attrs = {} - for attr in RDATTRIBUTES.keys(): + for attr in RDATTRIBUTES: if hasattr(ag, attr): pdb_attrs[attr] = getattr(ag, attr) resnames = pdb_attrs.get("resnames", None) @@ -504,10 +513,9 @@ def get_resname(idx): _add_mda_attr_to_rdkit(attr, values[i], mi, get_resname(i)) rdatom.SetMonomerInfo(mi) # other properties - for attr in other_attrs.keys(): - value = other_attrs[attr][i] - attr = "_MDAnalysis_%s" % _TOPOLOGY_ATTRS[attr].singular - _set_atom_property(rdatom, attr, value) + for attr, values in other_attrs.items(): + attr = f"_MDAnalysis_{_TOPOLOGY_ATTRS[attr].singular}" + _set_atom_property(rdatom, attr, values[i]) _set_atom_property(rdatom, "_MDAnalysis_index", i) # add atom index = mol.AddAtom(rdatom) @@ -577,7 +585,7 @@ def _add_mda_attr_to_rdkit(attr, value, mi, resname=""): # set attribute value in RDKit MonomerInfo rdattr = RDATTRIBUTES[attr] - getattr(mi, "Set%s" % rdattr)(value) + getattr(mi, f"Set{rdattr}")(value) def _set_str_prop(atom, attr, value): diff --git a/package/MDAnalysis/converters/RDKitInferring.py b/package/MDAnalysis/converters/RDKitInferring.py index 53a68b48793..c009ff60185 100644 --- a/package/MDAnalysis/converters/RDKitInferring.py +++ b/package/MDAnalysis/converters/RDKitInferring.py @@ -5,7 +5,7 @@ # Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors # (see the file AUTHORS for the full list of names) # -# Released under the GNU Public Licence, v2 or any higher version +# Released under the GNU Lesser General Public License, v2 or higher # # Please cite your use of MDAnalysis in published work: # @@ -69,8 +69,8 @@ with suppress(ImportError): from rdkit.Chem.rdDetermineBonds import ( - DetermineBondOrders, # available since 2022.09.1 - ) + DetermineBondOrders, + ) # available since 2022.09.1 def reorder_atoms( @@ -124,7 +124,7 @@ class MDAnalysisInferer: fix challenging cases must have single reactant and product, and cannot add any atom. - .. versionadded:: 2.7.0 + .. versionadded:: 2.8.0 """ MONATOMIC_CATION_CHARGES: ClassVar[Dict[int, int]] = { @@ -624,6 +624,8 @@ class TemplateInferer: back. Useful to avoid adding explicit hydrogens on the template which can prevent RDKit from finding a match between the template and the molecule. + + .. versionadded:: 2.8.0 """ template: "Chem.Mol" @@ -689,6 +691,8 @@ class RDKitInferer: """Uses RDKit's :func:`~rdkit.Chem.rdDetermineBonds.DetermineBondOrders` to infer bond orders and formal charges. This is the same algorithm used by the :ref:`xyz2mol ` package. + + .. versionadded:: 2.8.0 """ charge: int = 0 diff --git a/package/pyproject.toml b/package/pyproject.toml index 8fdb1b180d0..6040a238c25 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -21,13 +21,13 @@ build-backend = "setuptools.build_meta" [project] name = "MDAnalysis" dynamic = ['version', 'readme'] -license = {file = "LICENSE"} +license = { file = "LICENSE" } description = "An object-oriented toolkit to analyze molecular dynamics trajectories." authors = [ - {name = 'MDAnalysis Development Team', email = 'mdanalysis@numfocus.org'} + { name = 'MDAnalysis Development Team', email = 'mdanalysis@numfocus.org' }, ] maintainers = [ - {name = 'MDAnalysis Core Developers', email = 'mdanalysis@numfocus.org'} + { name = 'MDAnalysis Core Developers', email = 'mdanalysis@numfocus.org' }, ] requires-python = ">=3.10" dependencies = [ @@ -47,8 +47,14 @@ dependencies = [ 'mdahole2', ] keywords = [ - "python", "science", "chemistry", "biophysics", "molecular-dynamics", - "computational-chemistry", "molecular-simulation", "analysis", + "python", + "science", + "chemistry", + "biophysics", + "molecular-dynamics", + "computational-chemistry", + "molecular-simulation", + "analysis", "trajectory-analysis", ] classifiers = [ @@ -79,7 +85,7 @@ extra_formats = [ "pyedr>=0.7.0", "pytng>=0.2.3", "gsd>3.0.0", - "rdkit>=2021.09.1", + "rdkit>=2022.09.1", ] analysis = [ "biopython>=1.80", @@ -117,6 +123,4 @@ zip-safe = false find = {} [tool.setuptools.package-data] -MDAnalysis = [ - 'analysis/data/*.npy', -] +MDAnalysis = ['analysis/data/*.npy'] From ea265697b22af04ffcebff2352ac01edee5f608e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Bouysset?= Date: Mon, 1 Apr 2024 19:48:16 +0100 Subject: [PATCH 14/14] revert auto-formatting --- package/pyproject.toml | 20 +- .../MDAnalysisTests/converters/test_rdkit.py | 661 ++++++++---------- 2 files changed, 284 insertions(+), 397 deletions(-) diff --git a/package/pyproject.toml b/package/pyproject.toml index 6040a238c25..84cc1db4faf 100644 --- a/package/pyproject.toml +++ b/package/pyproject.toml @@ -21,13 +21,13 @@ build-backend = "setuptools.build_meta" [project] name = "MDAnalysis" dynamic = ['version', 'readme'] -license = { file = "LICENSE" } +license = {file = "LICENSE"} description = "An object-oriented toolkit to analyze molecular dynamics trajectories." authors = [ - { name = 'MDAnalysis Development Team', email = 'mdanalysis@numfocus.org' }, + {name = 'MDAnalysis Development Team', email = 'mdanalysis@numfocus.org'} ] maintainers = [ - { name = 'MDAnalysis Core Developers', email = 'mdanalysis@numfocus.org' }, + {name = 'MDAnalysis Core Developers', email = 'mdanalysis@numfocus.org'} ] requires-python = ">=3.10" dependencies = [ @@ -47,14 +47,8 @@ dependencies = [ 'mdahole2', ] keywords = [ - "python", - "science", - "chemistry", - "biophysics", - "molecular-dynamics", - "computational-chemistry", - "molecular-simulation", - "analysis", + "python", "science", "chemistry", "biophysics", "molecular-dynamics", + "computational-chemistry", "molecular-simulation", "analysis", "trajectory-analysis", ] classifiers = [ @@ -123,4 +117,6 @@ zip-safe = false find = {} [tool.setuptools.package-data] -MDAnalysis = ['analysis/data/*.npy'] +MDAnalysis = [ + 'analysis/data/*.npy', +] \ No newline at end of file diff --git a/testsuite/MDAnalysisTests/converters/test_rdkit.py b/testsuite/MDAnalysisTests/converters/test_rdkit.py index ad1e2682a51..4a8d40733dd 100644 --- a/testsuite/MDAnalysisTests/converters/test_rdkit.py +++ b/testsuite/MDAnalysisTests/converters/test_rdkit.py @@ -31,48 +31,36 @@ import numpy as np import pytest from MDAnalysis.topology.guessers import guess_atom_element -from MDAnalysisTests.datafiles import ( - GRO, - TRR, - PDB_full, - PDB_helix, - mol2_molecule, -) +from MDAnalysisTests.datafiles import (GRO, TRR, PDB_full, PDB_helix, + mol2_molecule) from MDAnalysisTests.util import import_not_available from numpy.testing import assert_allclose, assert_array_equal, assert_equal with suppress(ImportError): - from MDAnalysis.converters.RDKit import ( - RDATTRIBUTES, - _add_mda_attr_to_rdkit, - _set_atom_property, - atomgroup_to_mol, - ) - from MDAnalysis.converters.RDKitInferring import ( - MDAnalysisInferer, - RDKitInferer, - TemplateInferer, - reorder_atoms, - sanitize_mol, - ) + from MDAnalysis.converters.RDKit import (RDATTRIBUTES, + _add_mda_attr_to_rdkit, + _set_atom_property, + atomgroup_to_mol) + from MDAnalysis.converters.RDKitInferring import (MDAnalysisInferer, + RDKitInferer, + TemplateInferer, + reorder_atoms, + sanitize_mol) from rdkit import Chem from rdkit.Chem import AllChem -requires_rdkit = pytest.mark.skipif( - import_not_available("rdkit"), reason="requires RDKit" -) +requires_rdkit = pytest.mark.skipif(import_not_available("rdkit"), + reason="requires RDKit") -@pytest.mark.skipif( - not import_not_available("rdkit"), reason="only for min dependencies build" -) +@pytest.mark.skipif(not import_not_available("rdkit"), + reason="only for min dependencies build") class TestRequiresRDKit(object): def test_converter_requires_rdkit(self): u = mda.Universe(PDB_full) - with pytest.raises( - ImportError, match="RDKit is required for the RDKitConverter" - ): + with pytest.raises(ImportError, + match="RDKit is required for the RDKitConverter"): u.atoms.convert_to("RDKIT") @@ -118,28 +106,22 @@ def product(request): def is_isomorphic(mol, ref, useChirality=False): - return mol.HasSubstructMatch( - ref, useChirality=useChirality - ) and ref.HasSubstructMatch(mol, useChirality=useChirality) + return (mol.HasSubstructMatch(ref, useChirality=useChirality) + and ref.HasSubstructMatch(mol, useChirality=useChirality)) @requires_rdkit class TestRDKitReader(object): - @pytest.mark.parametrize( - "rdmol, n_frames", - [ - ("mol2_mol", 1), - ("smiles_mol", 3), - ], - indirect=["rdmol"], - ) + @pytest.mark.parametrize("rdmol, n_frames", [ + ("mol2_mol", 1), + ("smiles_mol", 3), + ], indirect=["rdmol"]) def test_coordinates(self, rdmol, n_frames): universe = mda.Universe(rdmol) assert universe.trajectory.n_frames == n_frames - expected = np.array( - [conf.GetPositions() for conf in rdmol.GetConformers()], - dtype=np.float32, - ) + expected = np.array([ + conf.GetPositions() for conf in rdmol.GetConformers()], + dtype=np.float32) assert_equal(expected, universe.trajectory.coordinate_array) def test_no_coordinates(self): @@ -153,9 +135,8 @@ def test_compare_mol2reader(self): universe = mda.Universe(MolFactory.mol2_mol()) mol2 = mda.Universe(mol2_molecule) assert universe.trajectory.n_frames == mol2.trajectory.n_frames - assert_equal( - universe.trajectory.ts.positions, mol2.trajectory.ts.positions - ) + assert_equal(universe.trajectory.ts.positions, + mol2.trajectory.ts.positions) @requires_rdkit @@ -168,9 +149,8 @@ def pdb(self): def mol2(self): u = mda.Universe(mol2_molecule) # add elements - elements = np.array( - [guess_atom_element(x) for x in u.atoms.types], dtype=object - ) + elements = np.array([guess_atom_element(x) for x in u.atoms.types], + dtype=object) u.add_TopologyAttr("elements", elements) return u @@ -192,35 +172,28 @@ def test_no_atoms_attr(self): @pytest.mark.parametrize("smi", ["[H]", "C", "O", "[He]"]) def test_single_atom_mol(self, smi): - u = mda.Universe.from_smiles( - smi, addHs=False, generate_coordinates=False - ) + u = mda.Universe.from_smiles(smi, addHs=False, + generate_coordinates=False) mol = u.atoms.convert_to.rdkit(inferer=None) assert mol.GetNumAtoms() == 1 assert mol.GetAtomWithIdx(0).GetSymbol() == smi.strip("[]") - @pytest.mark.parametrize( - "resname, n_atoms, n_fragments", - [ - ("PRO", 14, 1), - ("ILE", 38, 1), - ("ALA", 20, 2), - ("GLY", 21, 3), - ], - ) + @pytest.mark.parametrize("resname, n_atoms, n_fragments", [ + ("PRO", 14, 1), + ("ILE", 38, 1), + ("ALA", 20, 2), + ("GLY", 21, 3), + ]) def test_mol_from_selection(self, peptide, resname, n_atoms, n_fragments): mol = peptide.select_atoms("resname %s" % resname).convert_to("RDKIT") assert n_atoms == mol.GetNumAtoms() assert n_fragments == len(Chem.GetMolFrags(mol)) - @pytest.mark.parametrize( - "sel_str, atom_index", - [ - ("resid 1", 0), - ("resname LYS and name NZ", 1), - ("resid 34 and altloc B", 2), - ], - ) + @pytest.mark.parametrize("sel_str, atom_index", [ + ("resid 1", 0), + ("resname LYS and name NZ", 1), + ("resid 34 and altloc B", 2), + ]) def test_monomer_info(self, pdb, sel_str, atom_index): sel = pdb.select_atoms(sel_str) mda_atom = sel.atoms[atom_index] @@ -237,9 +210,8 @@ def test_monomer_info(self, pdb, sel_str, atom_index): assert mda_atom.segindex == mi.GetSegmentNumber() assert mda_atom.tempfactor == mi.GetTempFactor() - @pytest.mark.parametrize( - "rdmol", ["mol2_mol", "smiles_mol"], indirect=True - ) + @pytest.mark.parametrize("rdmol", ["mol2_mol", "smiles_mol"], + indirect=True) def test_identical_topology(self, rdmol): u = mda.Universe(rdmol) umol = u.atoms.convert_to("RDKIT") @@ -249,8 +221,10 @@ def test_identical_topology(self, rdmol): assert_equal(u.atoms.elements, u2.atoms.elements) assert_equal(u.atoms.names, u2.atoms.names) assert_allclose( - u.atoms.positions, u2.atoms.positions, rtol=0, atol=1e-7 - ) + u.atoms.positions, + u2.atoms.positions, + rtol=0, + atol=1e-7) def test_raise_requires_elements(self): u = mda.Universe(mol2_molecule) @@ -260,15 +234,14 @@ def test_raise_requires_elements(self): with pytest.raises( AttributeError, - match="`elements` attribute is required for the RDKitConverter", + match="`elements` attribute is required for the RDKitConverter" ): u.atoms.convert_to("RDKIT") def test_warn_guess_bonds(self): u = mda.Universe(PDB_helix) - with pytest.warns( - UserWarning, match="No `bonds` attribute in this AtomGroup" - ): + with pytest.warns(UserWarning, + match="No `bonds` attribute in this AtomGroup"): u.atoms.convert_to("RDKIT") def test_bonds_outside_sel(self): @@ -277,44 +250,39 @@ def test_bonds_outside_sel(self): ag.convert_to.rdkit(inferer=None) def test_error_no_hydrogen(self, uo2): - with pytest.raises( - AttributeError, - match="the converter requires all hydrogens to be " "explicit", - ): + with pytest.raises(AttributeError, + match="the converter requires all hydrogens to be " + "explicit"): uo2.atoms.convert_to("RDKIT") - def test_error_no_hydrogen_skip_inferring(self, uo2): + def test_error_no_hydrogen_implicit(self, uo2): with warnings.catch_warnings(): warnings.simplefilter("error") uo2.atoms.convert_to.rdkit(inferer=None) def test_warning_no_hydrogen_force(self, uo2): - with pytest.warns( - UserWarning, match="Forcing to continue the conversion" - ): + with pytest.warns(UserWarning, + match="Forcing to continue the conversion"): uo2.atoms.convert_to.rdkit(force=True) - @pytest.mark.parametrize( - "attr, value, expected", - [ - ("names", "N", " N "), - ("names", "CA", " CA "), - ("names", "CAT", " CAT"), - ("names", "N1", " N1 "), - ("names", "CE2", " CE2"), - ("names", "C12", " C12"), - ("names", "HD12", "HD12"), - ("names", "C123", "C123"), - ("altLocs", "A", "A"), - ("chainIDs", "B", "B"), - ("icodes", "C", "C"), - ("occupancies", 0.5, 0.5), - ("resnames", "LIG", "LIG"), - ("resids", 123, 123), - ("segindices", 1, 1), - ("tempfactors", 0.8, 0.8), - ], - ) + @pytest.mark.parametrize("attr, value, expected", [ + ("names", "N", " N "), + ("names", "CA", " CA "), + ("names", "CAT", " CAT"), + ("names", "N1", " N1 "), + ("names", "CE2", " CE2"), + ("names", "C12", " C12"), + ("names", "HD12", "HD12"), + ("names", "C123", "C123"), + ("altLocs", "A", "A"), + ("chainIDs", "B", "B"), + ("icodes", "C", "C"), + ("occupancies", 0.5, 0.5), + ("resnames", "LIG", "LIG"), + ("resids", 123, 123), + ("segindices", 1, 1), + ("tempfactors", 0.8, 0.8), + ]) def test_add_mda_attr_to_rdkit(self, attr, value, expected): mi = Chem.AtomPDBResidueInfo() _add_mda_attr_to_rdkit(attr, value, mi) @@ -330,13 +298,10 @@ def test_other_attributes(self, mol2, idx): assert mda_atom.segid == rdatom.GetProp("_MDAnalysis_segid") assert mda_atom.type == rdatom.GetProp("_MDAnalysis_type") - @pytest.mark.parametrize( - "sel_str", - [ - "resname ALA", - "resname PRO and segid A", - ], - ) + @pytest.mark.parametrize("sel_str", [ + "resname ALA", + "resname PRO and segid A", + ]) def test_index_property(self, pdb, sel_str): ag = pdb.select_atoms(sel_str) mol = ag.convert_to.rdkit(inferer=None) @@ -356,8 +321,7 @@ def test_assign_stereochemistry(self, mol2): def test_trajectory_coords(self): u = mda.Universe.from_smiles( - "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42) - ) + "CCO", numConfs=3, rdkit_kwargs=dict(randomSeed=42)) for ts in u.trajectory: mol = u.atoms.convert_to("RDKIT") positions = mol.GetConformer().GetPositions() @@ -435,36 +399,30 @@ def test_sanitize_fail_warning(self): with pytest.warns() as record: u.atoms.convert_to.rdkit(inferer=None) assert "Could not sanitize molecule" not in "\n".join( - [str(r.message) for r in record] - ) + [str(r.message)for r in record]) def test_deprecation_max_iter(self, mol2, monkeypatch): mock = Mock(wraps=atomgroup_to_mol) - monkeypatch.setattr( - "MDAnalysis.converters.RDKit.atomgroup_to_mol", mock - ) - with pytest.warns( - DeprecationWarning, match="Using `max_iter` is deprecated" - ): + monkeypatch.setattr("MDAnalysis.converters.RDKit.atomgroup_to_mol", + mock) + with pytest.warns(DeprecationWarning, + match="Using `max_iter` is deprecated"): mol2.atoms.convert_to.rdkit(max_iter=2) assert mock.call_args.kwargs["inferer"].max_iter == 2 def test_deprecation_NoImplicit(self, mol2): - with pytest.warns( - DeprecationWarning, match="Using `NoImplicit` is deprecated" - ): + with pytest.warns(DeprecationWarning, + match="Using `NoImplicit` is deprecated"): mol2.atoms.convert_to.rdkit(NoImplicit=True) def test_deprecation_atomgroup_to_mol_NoImplicit(self, mol2): - with pytest.warns( - DeprecationWarning, match="Using `NoImplicit` is deprecated" - ): + with pytest.warns(DeprecationWarning, + match="Using `NoImplicit` is deprecated"): atomgroup_to_mol(mol2.atoms, NoImplicit=False) def test_atomgroup_to_mol_unexpected_kwargs(self, mol2): - with pytest.raises( - ValueError, match="Found unexpected arguments: {'foo': 'bar'}" - ): + with pytest.raises(ValueError, + match="Found unexpected arguments: {'foo': 'bar'}"): atomgroup_to_mol(mol2.atoms, foo="bar") def test_custom_callable_inferer(self, mol2): @@ -481,22 +439,15 @@ def potatoe(mol): class TestRDKitInferringFunctions: def test_sanitize_mol_warning(self): mol = Chem.MolFromSmiles("[NH4]", sanitize=False) - with pytest.warns( - match=( - "Could not sanitize molecule: failed during step " - "rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_PROPERTIES" - ) - ): + with pytest.warns(match="Could not sanitize molecule: failed during " + "step rdkit.Chem.rdmolops.SanitizeFlags" + ".SANITIZE_PROPERTIES"): sanitize_mol(mol) def test_reorder_atoms_key_error(self): mol = Chem.MolFromSmiles("CCO") - with pytest.warns( - match=( - "'_MDAnalysis_index' not available on the input mol atoms, " - "skipping reordering of atoms" - ) - ): + with pytest.warns(match="'_MDAnalysis_index' not available on the" + " input mol atoms, skipping reordering of atoms"): reorder_atoms(mol) @@ -515,17 +466,14 @@ def add_Hs_remove_bo_and_charges(self, mol): return mH def assert_isomorphic_resonance_structure(self, mol, ref): - """Checks if 2 molecules are isomorphic using their resonance - structures + """ + Checks if 2 molecules are isomorphic using their resonance structures """ isomorphic = mol.HasSubstructMatch(ref) if not isomorphic: - isomorphic = bool( - Chem.ResonanceMolSupplier(mol).GetSubstructMatch(ref) - ) - assert ( - isomorphic - ), f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" + isomorphic = bool(Chem.ResonanceMolSupplier(mol) + .GetSubstructMatch(ref)) + assert isomorphic, f"{Chem.MolToSmiles(ref)} != {Chem.MolToSmiles(mol)}" @requires_rdkit @@ -545,27 +493,22 @@ def enumerate_reordered_mol(self, mol): def inferer(self): return MDAnalysisInferer() - @pytest.mark.parametrize( - "smi, out", - [ - ("C(-[H])(-[H])(-[H])-[H]", "C"), - ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), - ( - "[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", - "c1ccccc1", - ), - ("C-[C](-[H])-[O]", "C(=O)C"), - ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), - ("[N]-[C]-[H]", "N#C"), - ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), - ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), - ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), - ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), - ("[H]-[O]-[N]-[O]", "ON=O"), - ("[N]-[C]-[O]", "N#C[O-]"), - ], - ) + @pytest.mark.parametrize("smi, out", [ + ("C(-[H])(-[H])(-[H])-[H]", "C"), + ("[C](-[H])(-[H])-[C](-[H])-[H]", "C=C"), + ("[C]1(-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C]1(-[H])", + "c1ccccc1"), + ("C-[C](-[H])-[O]", "C(=O)C"), + ("[H]-[C](-[O])-[N](-[H])-[H]", "C(=O)N"), + ("[N]-[C]-[H]", "N#C"), + ("C-[C](-[O]-[H])-[O]", "CC(=O)O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O]-[H])-[O]", "P(O)(O)(O)=O"), + ("[P](-[O]-[H])(-[O]-[H])(-[O])-[O]", "P([O-])(O)(O)=O"), + ("[P](-[O]-[H])(-[O])(-[O])-[O]", "P([O-])([O-])(O)=O"), + ("[P](-[O])(-[O])(-[O])-[O]", "P([O-])([O-])([O-])=O"), + ("[H]-[O]-[N]-[O]", "ON=O"), + ("[N]-[C]-[O]", "N#C[O-]"), + ]) def test_infer_bond_orders(self, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) @@ -574,19 +517,14 @@ def test_infer_bond_orders(self, smi, out): Chem.SanitizeMol(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format( - Chem.MolToSmiles(mol), out - ) - - @pytest.mark.parametrize( - "smi, atom_idx, charge", - [ - ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), - ("[N]-[C]-[O]", 2, -1), - ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), - ("C-[C](-[O])-[O]", 3, -1), - ], - ) + assert is_isomorphic(mol, molref), f"{Chem.MolToSmiles(mol)} != {out}" + + @pytest.mark.parametrize("smi, atom_idx, charge", [ + ("[C](-[H])(-[H])(-[H])-[O]", 4, -1), + ("[N]-[C]-[O]", 2, -1), + ("[N](-[H])(-[H])(-[H])-[H]", 0, 1), + ("C-[C](-[O])-[O]", 3, -1), + ]) def test_infer_charges(self, smi, atom_idx, charge): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) @@ -595,62 +533,48 @@ def test_infer_charges(self, smi, atom_idx, charge): Chem.SanitizeMol(mol) assert mol.GetAtomWithIdx(atom_idx).GetFormalCharge() == charge - @pytest.mark.parametrize( - "smi, out", - [ - ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), - ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), - ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), - ( - "C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", - "CNC(N)=[N+](-[H])-[H]", - ), - ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), - ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), - ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), - ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), - ("[C-](=O)-C", "[C](=O)-C"), - ("[H]-[N-]-C", "[H]-[N]-C"), - ( - "[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", - "[O-]c1ccccc1", - ), - ( - "[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", - "[O-]C1=CC=CC1=O", - ), - ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), - ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), - ], - ) + @pytest.mark.parametrize("smi, out", [ + ("[S](-[O]-[H])(-[O]-[H])(-[O])-[O]", "S(=O)(=O)(O)O"), + ("[S](-[O]-[H])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])O"), + ("[S](-[O])(-[O])(-[O])-[O]", "S(=O)(=O)([O-])[O-]"), + ("C-[N](-[H])-[C](-[N](-[H])-[H])-[N](-[H])-[H]", + "CNC(N)=[N+](-[H])-[H]"), + ("[O]-[C](-[H])-[C](-[H])-[H]", "C([O-])=C"), + ("C-[N](-[O])-[O]", "C[N+](=O)[O-]"), + ("C(-[N](-[O])-[O])-[N](-[O])-[O]", "C([N+](=O)[O-])[N+](=O)[O-]"), + ("C-[N](-[O])-[O].C-[N](-[O])-[O]", "C[N+](=O)[O-].C[N+](=O)[O-]"), + ("[C-](=O)-C", "[C](=O)-C"), + ("[H]-[N-]-C", "[H]-[N]-C"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])-[C](-[H])1", + "[O-]c1ccccc1"), + ("[O]-[C]1-[C](-[H])-[C](-[H])-[C](-[H])-[C]1-[O]", + "[O-]C1=CC=CC1=O"), + ("[H]-[C]-[C]-[C](-[H])-[C](-[H])-[H]", "C#CC=C"), + ("[H]-[C]-[C]-[C]-[C]-[H]", "C#CC#C"), + ]) def test_standardize_patterns(self, inferer, smi, out): mol = Chem.MolFromSmiles(smi, sanitize=False) mol.UpdatePropertyCache(strict=False) mol = inferer(mol) mol = Chem.RemoveHs(mol) molref = Chem.MolFromSmiles(out) - assert is_isomorphic(mol, molref), "{} != {}".format( - Chem.MolToSmiles(mol), out - ) - - @pytest.mark.parametrize( - "attr, value, getter", - [ - ("index", 42, "GetIntProp"), - ("index", np.int8(42), "GetIntProp"), - ("index", np.int16(42), "GetIntProp"), - ("index", np.int32(42), "GetIntProp"), - ("index", np.int64(42), "GetIntProp"), - ("index", np.uint8(42), "GetIntProp"), - ("index", np.uint16(42), "GetIntProp"), - ("index", np.uint32(42), "GetIntProp"), - ("index", np.uint64(42), "GetIntProp"), - ("charge", 4.2, "GetDoubleProp"), - ("charge", np.float32(4.2), "GetDoubleProp"), - ("charge", np.float64(4.2), "GetDoubleProp"), - ("type", "C.3", "GetProp"), - ], - ) + assert is_isomorphic(mol, molref), f"{Chem.MolToSmiles(mol)} != {out}" + + @pytest.mark.parametrize("attr, value, getter", [ + ("index", 42, "GetIntProp"), + ("index", np.int8(42), "GetIntProp"), + ("index", np.int16(42), "GetIntProp"), + ("index", np.int32(42), "GetIntProp"), + ("index", np.int64(42), "GetIntProp"), + ("index", np.uint8(42), "GetIntProp"), + ("index", np.uint16(42), "GetIntProp"), + ("index", np.uint32(42), "GetIntProp"), + ("index", np.uint64(42), "GetIntProp"), + ("charge", 4.2, "GetDoubleProp"), + ("charge", np.float32(4.2), "GetDoubleProp"), + ("charge", np.float64(4.2), "GetDoubleProp"), + ("type", "C.3", "GetProp"), + ]) def test_set_atom_property(self, attr, value, getter): atom = Chem.Atom(1) prop = "_MDAnalysis_%s" % attr @@ -662,14 +586,11 @@ def test_ignore_prop(self): _set_atom_property(atom, "foo", {"bar": "baz"}) assert "foo" not in atom.GetPropsAsDict().items() - @pytest.mark.parametrize( - "smi", - [ - "c1ccc(cc1)-c1ccccc1-c1ccccc1", - "c1cc[nH]c1", - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - ], - ) + @pytest.mark.parametrize("smi", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + ]) def test_transfer_properties(self, inferer, smi): mol = Chem.MolFromSmiles(smi) mol = self.add_Hs_remove_bo_and_charges(mol) @@ -683,88 +604,86 @@ def test_transfer_properties(self, inferer, smi): new = {} for a in newmol.GetAtoms(): ix = a.GetIntProp("_MDAnalysis_index") - new[ix] = {"_MDAnalysis_index": ix, "dummy": a.GetProp("dummy")} + new[ix] = {"_MDAnalysis_index": ix, + "dummy": a.GetProp("dummy")} props = a.GetPropsAsDict().keys() assert "old_mapno" not in props assert "react_atom_idx" not in props assert new == old - @pytest.mark.parametrize( - "smi", - [ - "c1ccc(cc1)-c1ccccc1-c1ccccc1", - "c1cc[nH]c1", - "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", - "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", - "c1csc(c1)-c1ccoc1-c1cc[nH]c1", - "C1=C2C(=NC=N1)N=CN2", - "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", - "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", - "C=CC=CC=CC=CC=CC=C", - "NCCCCC([NH3+])C(=O)[O-]", - "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", - "C#CC=C", - # HID HIE HIP residues, see PR #2941 - "O=C([C@H](CC1=CNC=N1)N)O", - "O=C([C@H](CC1=CN=CN1)N)O", - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - # fixes from PR #3044 - "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", - "[O-][n+]1cccnc1", - "C[n+]1ccccc1", - "[PH4+]", - "c1nc[nH]n1", - "CC(=O)C=C(C)N", - "CC(C)=CC=C[O-]", - "O=S(C)(C)=NC", - "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", - "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", - "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", - "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", - "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", - "[O-][n+]1onc2ccccc21", - "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", - "[O-]c1ccccc1", - "[O-]C=CC=CCC=CC=[N+](C)C", - "C=[N+](-[O-])-C", - "C-[N-]-C(=O)C", - # amino acids - "C[C@H](N)C(=O)O", # A - "NCC(=O)O", # G - "CC[C@H](C)[C@H](N)C(=O)O", # I - "CC(C)C[C@H](N)C(=O)O", # L - "O=C(O)[C@@H]1CCCN1", # P - "CC(C)[C@H](N)C(=O)O", # V - "N[C@@H](Cc1ccccc1)C(=O)O", # F - "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W - "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y - "N[C@@H](CC(=O)O)C(=O)O", # D - "N[C@@H](CCC(=O)O)C(=O)O", # E - "N=C(N)NCCC[C@H](N)C(=O)O", # R - "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H - "NCCCC[C@H](N)C(=O)O", # K - "N[C@@H](CO)C(=O)O", # S - "C[C@@H](O)[C@H](N)C(=O)O", # T - "N[C@@H](CS)C(=O)O", # C - "CSCC[C@H](N)C(=O)O", # M - "NC(=O)C[C@H](N)C(=O)O", # N - "NC(=O)CC[C@H](N)C(=O)O", # Q - # nucleic acids - "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A - "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T - "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U - "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C - "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G - # RDKitConverter benchmark 2022-05-23, fixed by better sorting - "CC(C)NS(C)(=O)=Nc1ccc(-c2ccc(-c3nc4[nH]c(O[C@@H]5CO[C@@H]6[C@H](O)CO[C@H]56)nc4cc3Cl)cc2)cc1 CHEMBL4111464", - "C[n+]1c2ccccc2c(C(=O)O)c2[nH]c3ccc(Br)cc3c21 CHEMBL511591", - "C[n+]1ccccc1-c1ccc(NC(=O)c2ccc(C(=O)Nc3ccc(-c4cccc[n+]4C)cc3)cc2)cc1 CHEMBL3230729", - "Cc1ccc(/C(O)=C(/C([S-])=NCc2ccccc2)[n+]2cccc(CO)c2)cc1[N+](=O)[O-] CHEMBL1596426", - "CC(C)(O)c1cc2c(cc1C(C)(C)O)-c1ccccc1-2 CHEMBL1990966", - "[O-]/N=C1/c2ccccc2-c2nc3ccc(C(F)(F)F)cc3nc21 CHEMBL4557878", - "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", - ], - ) + @pytest.mark.parametrize("smi", [ + "c1ccc(cc1)-c1ccccc1-c1ccccc1", + "c1cc[nH]c1", + "c1ccc(cc1)-c1ccc(-c2ccccc2)c(-c2ccccc2)c1-c1ccccc1", + "c1ccc2c(c1)c1ccccc1c1ccccc1c1ccccc1c1ccccc21", + "c1csc(c1)-c1ccoc1-c1cc[nH]c1", + "C1=C2C(=NC=N1)N=CN2", + "CN1C=NC(=C1SC2=NC=NC3=C2NC=N3)[N+](=O)[O-]", + "c1c[nH]c(c1)-c1ccc(s1)-c1ccoc1-c1c[nH]cc1-c1ccccc1", + "C=CC=CC=CC=CC=CC=C", + "NCCCCC([NH3+])C(=O)[O-]", + "CC(C=CC1=C(C)CCCC1(C)C)=CC=CC(C)=CC=[NH+]C", + "C#CC=C", + # HID HIE HIP residues, see PR #2941 + "O=C([C@H](CC1=CNC=N1)N)O", + "O=C([C@H](CC1=CN=CN1)N)O", + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + # fixes from PR #3044 + "CCOC(=O)c1cc2cc(C(=O)O)ccc2[nH]1", + "[O-][n+]1cccnc1", + "C[n+]1ccccc1", + "[PH4+]", + "c1nc[nH]n1", + "CC(=O)C=C(C)N", + "CC(C)=CC=C[O-]", + "O=S(C)(C)=NC", + "Cc1ccc2c3ncn(Cc4ccco4)c(O)c-3nc2c1", + "CCCC/C=C/C#CC#CCCCCCCCC(=O)O", + "c1c2c(=O)n3cccc(C)c3nc2n(C)c(=N)c1C(=O)NCc1cnccc1", + "N#Cc1c[nH]c(C(=O)NC(=O)c2cc[n+]([O-])cc2)n1", + "C[C@@H](Oc1cc(F)ccc1Nc1ncnc2cc(N=S3(=O)CCC3)cc(F)c12)C(=O)NCC#N", + "[O-][n+]1onc2ccccc21", + "Cc1cc[n+](CC[n+]2ccc(C)cc2)cc1", + "[O-]c1ccccc1", + "[O-]C=CC=CCC=CC=[N+](C)C", + "C=[N+](-[O-])-C", + "C-[N-]-C(=O)C", + # amino acids + "C[C@H](N)C(=O)O", # A + "NCC(=O)O", # G + "CC[C@H](C)[C@H](N)C(=O)O", # I + "CC(C)C[C@H](N)C(=O)O", # L + "O=C(O)[C@@H]1CCCN1", # P + "CC(C)[C@H](N)C(=O)O", # V + "N[C@@H](Cc1ccccc1)C(=O)O", # F + "N[C@@H](Cc1c[nH]c2ccccc12)C(=O)O", # W + "N[C@@H](Cc1ccc(O)cc1)C(=O)O", # Y + "N[C@@H](CC(=O)O)C(=O)O", # D + "N[C@@H](CCC(=O)O)C(=O)O", # E + "N=C(N)NCCC[C@H](N)C(=O)O", # R + "N[C@@H](Cc1c[nH]cn1)C(=O)O", # H + "NCCCC[C@H](N)C(=O)O", # K + "N[C@@H](CO)C(=O)O", # S + "C[C@@H](O)[C@H](N)C(=O)O", # T + "N[C@@H](CS)C(=O)O", # C + "CSCC[C@H](N)C(=O)O", # M + "NC(=O)C[C@H](N)C(=O)O", # N + "NC(=O)CC[C@H](N)C(=O)O", # Q + # nucleic acids + "Nc1ncnc2c1ncn2[C@H]1C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O1", # A + "Cc1cn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]c1=O", # T + "O=c1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # U + "Nc1ccn([C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)n1", # C + "Nc1nc2c(ncn2[C@H]2C[C@H](OP(=O)(O)O)[C@@H](COP(=O)(O)O)O2)c(=O)[nH]1", # G + # RDKitConverter benchmark 2022-05-23, fixed by better sorting + "CC(C)NS(C)(=O)=Nc1ccc(-c2ccc(-c3nc4[nH]c(O[C@@H]5CO[C@@H]6[C@H](O)CO[C@H]56)nc4cc3Cl)cc2)cc1 CHEMBL4111464", + "C[n+]1c2ccccc2c(C(=O)O)c2[nH]c3ccc(Br)cc3c21 CHEMBL511591", + "C[n+]1ccccc1-c1ccc(NC(=O)c2ccc(C(=O)Nc3ccc(-c4cccc[n+]4C)cc3)cc2)cc1 CHEMBL3230729", + "Cc1ccc(/C(O)=C(/C([S-])=NCc2ccccc2)[n+]2cccc(CO)c2)cc1[N+](=O)[O-] CHEMBL1596426", + "CC(C)(O)c1cc2c(cc1C(C)(C)O)-c1ccccc1-2 CHEMBL1990966", + "[O-]/N=C1/c2ccccc2-c2nc3ccc(C(F)(F)F)cc3nc21 CHEMBL4557878", + "N#Cc1c[nH]c(C(=O)Nc2ccc(-c3cccc[n+]3[O-])cc2C2=CCCCC2)n1 CHEMBL1172116", + ]) def test_order_independant(self, inferer, smi): # generate mol with hydrogens but without bond orders ref = Chem.MolFromSmiles(smi) @@ -776,15 +695,12 @@ def test_order_independant(self, inferer, smi): self.assert_isomorphic_resonance_structure(m, ref) @pytest.mark.xfail(reason="Not currently tackled by the RDKitConverter") - @pytest.mark.parametrize( - "smi", - [ - "C-[N+]#N", - "C-N=[N+]=[N-]", - "C-[O+]=C", - "C-[N+]#[C-]", - ], - ) + @pytest.mark.parametrize("smi", [ + "C-[N+]#N", + "C-N=[N+]=[N-]", + "C-[O+]=C", + "C-[N+]#[C-]", + ]) def test_order_independant_issue_3339(self, smi): self.test_order_independant(smi) @@ -792,56 +708,37 @@ def test_warn_conjugated_max_iter(self): smi = "[C-]C=CC=CC=CC=CC=CC=C[C-]" mol = Chem.MolFromSmiles(smi) inferer = MDAnalysisInferer(max_iter=2) - with pytest.warns( - UserWarning, match="reasonable number of iterations" - ): + with pytest.warns(UserWarning, + match="reasonable number of iterations"): inferer._rebuild_conjugated_bonds(mol) def test_deprecation_warning_max_iter(self): smi = "c1ccccc1" mol = Chem.MolFromSmiles(smi) inferer = MDAnalysisInferer() - with pytest.warns( - DeprecationWarning, - match="Specifying `max_iter` is deprecated and will be removed", - ): + with pytest.warns(DeprecationWarning, + match="Specifying `max_iter` is deprecated and will" + " be removed"): inferer._standardize_patterns(mol, max_iter=1) - @pytest.mark.parametrize( - "smi", - [ - "[Li+]", - "[Na+]", - "[K+]", - "[Rb+]", - "[Ag+]", - "[Cs+]", - "[Mg+2]", - "[Ca+2]", - "[Cu+2]", - "[Zn+2]", - "[Sr+2]", - "[Ba+2]", - "[Al+3]", - "[Fe+2]", - "[Cl-]", - "[O-2]", - "[Na+].[Cl-]", - ], - ) + @pytest.mark.parametrize("smi", [ + "[Li+]", "[Na+]", "[K+]", "[Rb+]", "[Ag+]", "[Cs+]", + "[Mg+2]", "[Ca+2]", "[Cu+2]", "[Zn+2]", "[Sr+2]", "[Ba+2]", + "[Al+3]", "[Fe+2]", + "[Cl-]", + "[O-2]", + "[Na+].[Cl-]", + ]) def test_ions(self, inferer, smi): ref = Chem.MolFromSmiles(smi) stripped_mol = self.add_Hs_remove_bo_and_charges(ref) mol = inferer(stripped_mol) assert is_isomorphic(mol, ref) - @pytest.mark.parametrize( - "smi", - [ - "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", - "O=S(C)(C)=NC", - ], - ) + @pytest.mark.parametrize("smi", [ + "O=C([C@H](CC1=C[NH1+]=CN1)[NH3+])[O-]", + "O=S(C)(C)=NC", + ]) def test_reorder_atoms(self, smi): mol = Chem.MolFromSmiles(smi) mol = Chem.AddHs(mol) @@ -859,19 +756,15 @@ def test_pdb_names(self): u = mda.Universe(PDB_helix) mol = u.atoms.convert_to.rdkit() names = u.atoms.names - rd_names = np.array( - [a.GetProp("_MDAnalysis_name") for a in mol.GetAtoms()] - ) + rd_names = np.array([a.GetProp("_MDAnalysis_name") + for a in mol.GetAtoms()]) assert (names == rd_names).all() - @pytest.mark.parametrize( - "smi", - [ - r"F/C(Br)=C(Cl)/I", - r"F\C(Br)=C(Cl)\I", - "F-C(Br)=C(Cl)-I", - ], - ) + @pytest.mark.parametrize("smi", [ + r"F/C(Br)=C(Cl)/I", + r"F\C(Br)=C(Cl)\I", + "F-C(Br)=C(Cl)-I", + ]) def test_bond_stereo_not_stereoany(self, smi): u = mda.Universe.from_smiles(smi) mol = u.atoms.convert_to.rdkit(force=True) @@ -881,15 +774,13 @@ def test_bond_stereo_not_stereoany(self, smi): def test_atom_sorter(self): mol = Chem.MolFromSmiles( - "[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False - ) + "[H]-[C](-[H])-[C](-[H])-[C]-[C]-[H]", sanitize=False) # corresponding mol: C=C-C#C # atom indices: 1 3 5 6 mol.UpdatePropertyCache() - sorted_atoms = sorted( - [atom for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1], - key=MDAnalysisInferer._atom_sorter, - ) + sorted_atoms = sorted([atom for atom in mol.GetAtoms() + if atom.GetAtomicNum() > 1], + key=MDAnalysisInferer._atom_sorter) sorted_indices = [atom.GetIdx() for atom in sorted_atoms] assert sorted_indices == [6, 5, 1, 3]