Faster name selections (#2755)

* fix #2751 * Python 2.7/3 backport of PR #2755 (use six.str_types, update CHANGELOG and versionchanged as 1.0.1) * modified AtomNames topologyattr to include lookup table index * rework atom name selection to use lookup tables * fixed test supplying integer as atom name * Update test_topologyattrs.py * use dict-lookup string attrs EVERYWHERERE * made protein selection faster, 48ms -> 0.5ms on GRO testfile * improved nucleic/backbone selections * Added explicit tests for Resnames topologyattr tests now provide str types for resnames/icodes * use fnmatchcase to be case sensitive (this was a small unreported bug in 1.0.0: the matching was done case-insensitive) Co-authored-by: Irfan Alibay <IAlibay@users.noreply.github.com> Co-authored-by: Oliver Beckstein <orbeckst@gmail.com> (cherry picked from commit 45e56e8)
MDAnalysis · Aug 26, 2020 · 8a29e83 · 8a29e83
1 parent 9100720
commit 8a29e83
Show file tree

Hide file tree

Showing 6 changed files with 396 additions and 98 deletions.
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -38,12 +38,14 @@ Fixes
     empty AtomGroup (Issue #2848)
   * Fixed reading in masses and charges from a hoomdxml file
     (Issue #2888, PR #2889)
+  * Fixed performance regression on select_atoms for string selections (#2751)
   * Fixed the DMSParser, allowing the creation of multiple segids sharing
     residues with identical resids (Issue #1387, PR #2872)
 
 Enhancements
   * Improved performances when parsing TPR files (PR #2804)
 
+
 Deprecations
   * waterdynamics.HydrogenBondLifetimes will be removed in 2.0.0 and
     replaced with hydrogenbonds.HydrogenBondAnalysis.lifetime() (#2547)

diff --git a/package/MDAnalysis/core/selection.py b/package/MDAnalysis/core/selection.py
@@ -301,7 +301,7 @@ def __init__(self, parser, tokens):
         self.inRadius = float(tokens.popleft())
         self.exRadius = float(tokens.popleft())
         self.sel = parser.parse_expression(self.precedence)
-    
+
     @return_empty_on_apply
     def apply(self, group):
         indices = []
@@ -520,7 +520,7 @@ def apply(self, group):
         return group[mask]
 
 
-class StringSelection(Selection):
+class _ProtoStringSelection(Selection):
     """Selections based on text attributes
 
     .. versionchanged:: 1.0.0
@@ -535,11 +535,23 @@ def __init__(self, parser, tokens):
 
     @return_empty_on_apply
     def apply(self, group):
-        mask = np.zeros(len(group), dtype=np.bool)
-        for val in self.values:
-            values = getattr(group, self.field)
-            mask |= [fnmatch.fnmatch(x, val) for x in values]
-        return group[mask].unique
+        # rather than work on group.names, cheat and look at the lookup table
+        nmattr = getattr(group.universe._topology, self.field)
+
+        matches = []  # list of passing indices
+        # iterate through set of known atom names, check which pass
+        for nm, ix in nmattr.namedict.items():
+            if any(fnmatch.fnmatchcase(nm, val) for val in self.values):
+                matches.append(ix)
+
+        # atomname indices for members of this group
+        nmidx = nmattr.nmidx[getattr(group, self.level)]
+
+        return group[np.in1d(nmidx, matches)].unique
+
+
+class StringSelection(_ProtoStringSelection):
+    level = 'ix'  # operates on atom level attribute, i.e. '.ix'
 
 
 class AtomNameSelection(StringSelection):
@@ -566,22 +578,27 @@ class AtomICodeSelection(StringSelection):
     field = 'icodes'
 
 
-class ResidueNameSelection(StringSelection):
+class _ResidueStringSelection(_ProtoStringSelection):
+    level= 'resindices'
+
+
+class ResidueNameSelection(_ResidueStringSelection):
     """Select atoms based on 'resnames' attribute"""
     token = 'resname'
     field = 'resnames'
 
 
-class MoleculeTypeSelection(StringSelection):
+class MoleculeTypeSelection(_ResidueStringSelection):
     """Select atoms based on 'moltypes' attribute"""
     token = 'moltype'
     field = 'moltypes'
 
 
-class SegmentNameSelection(StringSelection):
+class SegmentNameSelection(_ProtoStringSelection):
     """Select atoms based on 'segids' attribute"""
     token = 'segid'
     field = 'segids'
+    level = 'segindices'
 
 
 class AltlocSelection(StringSelection):
@@ -791,10 +808,15 @@ class ProteinSelection(Selection):
     See Also
     --------
     :func:`MDAnalysis.lib.util.convert_aa_code`
+
+
+    .. versionchanged:: 1.0.1
+       prot_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'protein'
 
-    prot_res = np.array([
+    prot_res = {
         # CHARMM top_all27_prot_lipid.rtf
         'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HSD',
         'HSE', 'HSP', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR',
@@ -817,14 +839,20 @@ class ProteinSelection(Selection):
         'CLEU', 'CILE', 'CVAL', 'CASF', 'CASN', 'CGLN', 'CARG', 'CHID', 'CHIE',
         'CHIP', 'CTRP', 'CPHE', 'CTYR', 'CGLU', 'CASP', 'CLYS', 'CPRO', 'CCYS',
         'CCYX', 'CMET', 'CME', 'ASF',
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        resname_attr = group.universe._topology.resnames
+        # which values in resname attr are in prot_res?
+        matches = [ix for (nm, ix) in resname_attr.namedict.items()
+                   if nm in self.prot_res]
+        # index of each atom's resname
+        nmidx = resname_attr.nmidx[group.resindices]
+        # intersect atom's resname index and matches to prot_res
+        return group[np.in1d(nmidx, matches)].unique
 
 
 class NucleicSelection(Selection):
@@ -839,23 +867,32 @@ class NucleicSelection(Selection):
 
     .. versionchanged:: 0.8
        additional Gromacs selections
+    .. versionchanged:: 1.0.1
+       nucl_res changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleic'
 
-    nucl_res = np.array([
+    nucl_res = {
         'ADE', 'URA', 'CYT', 'GUA', 'THY', 'DA', 'DC', 'DG', 'DT', 'RA',
         'RU', 'RG', 'RC', 'A', 'T', 'U', 'C', 'G',
         'DA5', 'DC5', 'DG5', 'DT5',
         'DA3', 'DC3', 'DG3', 'DT3',
         'RA5', 'RU5', 'RG5', 'RC5',
         'RA3', 'RU3', 'RG3', 'RC3'
-    ])
+    }
 
     def __init__(self, parser, tokens):
         pass
 
     def apply(self, group):
-        mask = np.in1d(group.resnames, self.nucl_res)
+        resnames = group.universe._topology.resnames
+        nmidx = resnames.nmidx[group.resindices]
+
+        matches = [ix for (nm, ix) in resnames.namedict.items()
+                   if nm in self.nucl_res]
+        mask = np.in1d(nmidx, matches)
+
         return group[mask].unique
 
 
@@ -864,29 +901,65 @@ class BackboneSelection(ProteinSelection):
 
     This excludes OT* on C-termini
     (which are included by, eg VMD's backbone selection).
+
+
+    .. versionchanged:: 1.0.1
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'backbone'
-    bb_atoms = np.array(['N', 'CA', 'C', 'O'])
+    bb_atoms = {'N', 'CA', 'C', 'O'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.prot_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.prot_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicBackboneSelection(NucleicSelection):
     """Contains all atoms with name "P", "C5'", C3'", "O3'", "O5'".
 
     These atoms are only recognized if they are in a residue matched
     by the :class:`NucleicSelection`.
+
+
+    .. versionchanged:: 1.0.1
+       bb_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbackbone'
-    bb_atoms = np.array(["P", "C5'", "C3'", "O3'", "O5'"])
+    bb_atoms = {"P", "C5'", "C3'", "O3'", "O5'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.bb_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.bb_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class BaseSelection(NucleicSelection):
@@ -896,29 +969,65 @@ class BaseSelection(NucleicSelection):
 
      'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
      'O6','N2','N6', 'O2','N4','O4','C5M'
+
+
+    .. versionchanged:: 1.0.1
+       base_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicbase'
-    base_atoms = np.array([
+    base_atoms = {
         'N9', 'N7', 'C8', 'C5', 'C4', 'N3', 'C2', 'N1', 'C6',
         'O6', 'N2', 'N6',
-        'O2', 'N4', 'O4', 'C5M'])
+        'O2', 'N4', 'O4', 'C5M'}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.base_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.base_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class NucleicSugarSelection(NucleicSelection):
     """Contains all atoms with name C1', C2', C3', C4', O2', O4', O3'.
+
+
+    .. versionchanged:: 1.0.1
+       sug_atoms changed to set (from numpy array)
+       performance improved by ~100x on larger systems
     """
     token = 'nucleicsugar'
-    sug_atoms = np.array(["C1'", "C2'", "C3'", "C4'", "O4'"])
+    sug_atoms = {"C1'", "C2'", "C3'", "C4'", "O4'"}
 
     def apply(self, group):
-        mask = np.in1d(group.names, self.sug_atoms)
-        mask &= np.in1d(group.resnames, self.nucl_res)
-        return group[mask].unique
+        atomnames = group.universe._topology.names
+        resnames = group.universe._topology.resnames
+
+        # filter by atom names
+        name_matches = [ix for (nm, ix) in atomnames.namedict.items()
+                        if nm in self.sug_atoms]
+        nmidx = atomnames.nmidx[group.ix]
+        group = group[np.in1d(nmidx, name_matches)]
+
+        # filter by resnames
+        resname_matches = [ix for (nm, ix) in resnames.namedict.items()
+                           if nm in self.nucl_res]
+        nmidx = resnames.nmidx[group.resindices]
+        group = group[np.in1d(nmidx, resname_matches)]
+
+        return group.unique
 
 
 class PropertySelection(Selection):
@@ -1035,7 +1144,7 @@ class SameSelection(Selection):
     Selects all atoms that have the same subkeyword value as any atom in selection
 
     .. versionchanged:: 1.0.0
-       Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to 
+       Map :code:`"residue"` to :code:`"resindices"` and :code:`"segment"` to
        :code:`"segindices"` (see #2669 and #2672)
     """