Merge pull request #43 from HumanCellAtlas/mb-cell-ranger-func-equiva…

…lence CountMatrix bugfix and new tests
HumanCellAtlas · Jul 25, 2018 · 9bff1c6 · 9bff1c6
2 parents 24ed25f + 57b9369
commit 9bff1c6
Show file tree

Hide file tree

Showing 21 changed files with 1,126 additions and 300 deletions.
diff --git a/src/sctools/__init__.py b/src/sctools/__init__.py
@@ -7,6 +7,7 @@
 from . import reader
 from . import metrics
 from . import platform
+from . import consts
 from pkg_resources import get_distribution, DistributionNotFound
 
 

diff --git a/src/sctools/bam.py b/src/sctools/bam.py
@@ -13,30 +13,36 @@
 
 Methods
 -------
-iter_tag_groups                     function to iterate over reads by an arbitrary tag
-iter_cell_barcodes                  wrapper for iter_tag_groups that iterates over cell barcode tags
-iter_genes                          wrapper for iter_tag_groups that iterates over gene tags
-iter_molecules                      wrapper for iter_tag_groups that iterates over molecule tags
+iter_tag_groups                         function to iterate over reads by an arbitrary tag
+iter_cell_barcodes                      wrapper for iter_tag_groups that iterates over cell barcode tags
+iter_genes                              wrapper for iter_tag_groups that iterates over gene tags
+iter_molecules                          wrapper for iter_tag_groups that iterates over molecule tags
 
 Classes
 -------
-SubsetAlignments                    class to extract reads specific to requested chromosome(s)
-Tagger                              class to add tags to sam/bam records from paired fastq records
+SubsetAlignments                        class to extract reads specific to requested chromosome(s)
+Tagger                                  class to add tags to sam/bam records from paired fastq records
+AlignmentSortOrder                      abstract class to represent alignment sort orders
+QueryNameSortOrder                      alignment sort order by query name
+CellMoleculeGeneQueryNameSortOrder      alignment sort order hierarchically cell > molecule > gene > query name
 
 References
 ----------
 htslib : https://github.com/samtools/htslib
 
 """
 
-import warnings
-import os
 import math
+import os
+import warnings
+from abc import abstractmethod
 from itertools import cycle
-from typing import Iterator, Generator, List, Union, Tuple
+from typing import Iterator, Generator, List, Dict, Union, Tuple, Callable, Any, Optional
 
 import pysam
 
+from . import consts
+
 
 class SubsetAlignments:
     """Wrapper for pysam/htslib that extracts reads corresponding to requested chromosome(s)
@@ -74,9 +80,8 @@ def __init__(self, alignment_file: str, open_mode: str=None):
                 open_mode = 'r'
             else:
                 raise ValueError(
-                    'could not autodetect file type for alignment_file %s (detectible suffixes: '
-                    '.sam, .bam)' % alignment_file
-                )
+                    f'Could not autodetect file type for alignment_file {alignment_file} (detectable suffixes: '
+                    f'.sam, .bam)')
         self._file: str = alignment_file
         self._open_mode: str = open_mode
 
@@ -164,7 +169,7 @@ class Tagger:
 
     def __init__(self, bam_file: str) -> None:
         if not isinstance(bam_file, str):
-            raise TypeError('bam_file must be type str, not %s' % type(bam_file))
+            raise TypeError(f'The argument "bam_file" must be of type str, not {type(bam_file)}')
         self.bam_file = bam_file
 
     # todo add type to tag_generators (make sure it doesn't introduce import issues
@@ -193,7 +198,7 @@ def tag(self, output_bam_name: str, tag_generators) -> None:
                 outbam.write(sam_record)
 
 
-def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]:
+def split(in_bam: str, out_prefix: str, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]:
     """split `in_bam` by tag into files of `approx_mb_per_split`
 
     Parameters
@@ -203,7 +208,7 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru
     out_prefix : str
         Prefix for all output files; output will be named as prefix_n where n is an integer equal
         to the chunk number.
-    tags : list
+    tags : tuple
         The bam tags to split on. The tags are checked in order, and sorting is done based on the
         first identified tag. Further tags are only checked if the first tag is missing. This is
         useful in cases where sorting is executed over a corrected barcode, but some records only
@@ -231,44 +236,47 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru
     if len(tags) == 0:
         raise ValueError('At least one tag must be passed')
 
-    def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None:
+    def _cleanup(
+            _files_to_counts: Dict[pysam.AlignmentFile, int], _files_to_names: Dict[pysam.AlignmentFile, str],
+            rm_all: bool=False) -> None:
         """Closes file handles and remove any empty files.
 
         Parameters
         ----------
-        files_to_counts : dict
+        _files_to_counts : dict
             Dictionary of file objects to the number of reads each contains.
-        files_to_names : dict
+        _files_to_names : dict
             Dictionary of file objects to file handles.
         rm_all : bool, optional
             If True, indicates all files should be removed, regardless of count number, else only
             empty files without counts are removed (default = False)
 
         """
-        for bamfile, count in files_to_counts.items():
+        for bamfile, count in _files_to_counts.items():
             # corner case: clean up files that were created, but didn't get data because
             # n_cell < n_barcode
             bamfile.close()
             if count == 0 or rm_all:
-                os.remove(files_to_names[bamfile])
-                del files_to_names[bamfile]
+                os.remove(_files_to_names[bamfile])
+                del _files_to_names[bamfile]
 
     # find correct number of subfiles to spawn
     bam_mb = os.path.getsize(in_bam) * 1e-6
     n_subfiles = int(math.ceil(bam_mb / approx_mb_per_split))
-    if n_subfiles > 500:
-        warnings.warn('Number of requested subfiles (%d) exceeds 500; this may cause OS errors by '
-                      'exceeding fid limits' % n_subfiles)
-    if n_subfiles > 1000:
-        raise ValueError('Number of requested subfiles (%d) exceeds 1000; this will usually cause '
-                         'OS errors, think about increasing max_mb_per_split.' % n_subfiles)
+    if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN:
+        warnings.warn(f'Number of requested subfiles ({n_subfiles}) exceeds '
+                      f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN}; this may cause OS errors by exceeding fid limits')
+    if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE:
+        raise ValueError(f'Number of requested subfiles ({n_subfiles}) exceeds '
+                         f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE}; this will usually cause OS errors, '
+                         f'think about increasing max_mb_per_split.')
 
     # create all the output files
     with pysam.AlignmentFile(in_bam, 'rb', check_sq=False) as input_alignments:
 
         # map files to counts
-        files_to_counts = {}
-        files_to_names = {}
+        files_to_counts: Dict[pysam.AlignmentFile, int] = {}
+        files_to_names: Dict[pysam.AlignmentFile, str] = {}
         for i in range(n_subfiles):
             out_bam_name = out_prefix + '_%d.bam' % i
             open_bam = pysam.AlignmentFile(out_bam_name, 'wb', template=input_alignments)
@@ -297,8 +305,7 @@ def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None:
             if tag_content is None:
                 if raise_missing:
                     _cleanup(files_to_counts, files_to_names, rm_all=True)
-                    raise RuntimeError(
-                        'alignment encountered that is missing %s tag(s).' % repr(tags))
+                    raise RuntimeError('Alignment encountered that is missing {repr(tags)} tag(s).')
                 else:
                     continue  # move on to next alignment
 
@@ -379,12 +386,12 @@ def iter_molecule_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Gene
     Yields
     ------
     grouped_by_tag : Iterator[pysam.AlignedSegment]
-        reads sharing a unique molecule barcode (``UB`` tag)
+        reads sharing a unique molecule barcode tag
     current_tag : str
         the molecule barcode that records in the group all share
 
     """
-    return iter_tag_groups(tag='UB', bam_iterator=bam_iterator)
+    return iter_tag_groups(tag=consts.MOLECULE_BARCODE_TAG_KEY, bam_iterator=bam_iterator)
 
 
 def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
@@ -398,12 +405,12 @@ def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generato
     Yields
     ------
     grouped_by_tag : Iterator[pysam.AlignedSegment]
-        reads sharing a unique cell barcode (``CB`` tag)
+        reads sharing a unique cell barcode tag
     current_tag : str
         the cell barcode that reads in the group all share
 
     """
-    return iter_tag_groups(tag='CB', bam_iterator=bam_iterator)
+    return iter_tag_groups(tag=consts.CELL_BARCODE_TAG_KEY, bam_iterator=bam_iterator)
 
 
 def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
@@ -417,9 +424,69 @@ def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
     Yields
     ------
     grouped_by_tag : Iterator[pysam.AlignedSegment]
-        reads sharing a unique gene id (``GE`` tag)
+        reads sharing a unique gene name tag
     current_tag : str
         the gene id that reads in the group all share
 
     """
-    return iter_tag_groups(tag='GE', bam_iterator=bam_iterator)
+    return iter_tag_groups(tag=consts.GENE_NAME_TAG_KEY, bam_iterator=bam_iterator)
+
+
+def get_tag_or_default(alignment: pysam.AlignedSegment, tag_key: str, default: Optional[str] = None) -> Optional[str]:
+    """Extracts the value associated to `tag_key` from `alignment`, and returns a default value
+    if the tag is not present."""
+    try:
+        return alignment.get_tag(tag_key)
+    except KeyError:
+        return default
+
+
+class AlignmentSortOrder:
+    """The base class of alignment sort orders."""
+    @property
+    @abstractmethod
+    def key_generator(self) -> Callable[[pysam.AlignedSegment], Any]:
+        """Returns a callable function that calculates a sort key from given pysam.AlignedSegment."""
+        raise NotImplementedError
+
+
+class QueryNameSortOrder(AlignmentSortOrder):
+    """Alignment record sort order by query name."""
+    @staticmethod
+    def get_sort_key(alignment: pysam.AlignedSegment) -> str:
+        return alignment.query_name
+
+    @property
+    def key_generator(self):
+        return QueryNameSortOrder.get_sort_key
+
+    def __repr__(self) -> str:
+        return 'query_name'
+
+
+class CellMoleculeGeneQueryNameSortOrder(AlignmentSortOrder):
+    """Hierarchical alignment record sort order (cell barcode >= molecule barcode >= gene name >= query name)."""
+    def __init__(
+            self,
+            cell_barcode_tag_key: str = consts.CELL_BARCODE_TAG_KEY,
+            molecule_barcode_tag_key: str = consts.MOLECULE_BARCODE_TAG_KEY,
+            gene_name_tag_key: str = consts.GENE_NAME_TAG_KEY) -> None:
+        assert cell_barcode_tag_key, "Cell barcode tag key can not be None"
+        assert molecule_barcode_tag_key, "Molecule barcode tag key can not be None"
+        assert gene_name_tag_key, "Gene name tag key can not be None"
+        self.cell_barcode_tag_key = cell_barcode_tag_key
+        self.molecule_barcode_tag_key = molecule_barcode_tag_key
+        self.gene_name_tag_key = gene_name_tag_key
+
+    def _get_sort_key(self, alignment: pysam.AlignedSegment) -> Tuple[str, str, str, str]:
+        return (get_tag_or_default(alignment, self.cell_barcode_tag_key, default='N'),
+                get_tag_or_default(alignment, self.molecule_barcode_tag_key, default='N'),
+                get_tag_or_default(alignment, self.gene_name_tag_key, default='N'),
+                alignment.query_name)
+
+    @property
+    def key_generator(self) -> Callable[[pysam.AlignedSegment], Tuple[str, str, str, str]]:
+        return self._get_sort_key
+
+    def __repr__(self) -> str:
+        return 'hierarchical__cell_molecule_gene_query_name'
diff --git a/src/sctools/barcode.py b/src/sctools/barcode.py
@@ -22,6 +22,7 @@
 import numpy as np
 import pysam
 
+from . import consts
 from .encodings import TwoBit
 from .stats import base4_entropy
 
@@ -47,11 +48,11 @@ class can optionally be constructed from an iterable where barcodes can be prese
     """
     def __init__(self, barcodes: Mapping[str, int], barcode_length: int):
         if not isinstance(barcodes, Mapping):
-            raise TypeError('barcode set must be a dict-like object mapping barcodes to counts')
+            raise TypeError('The argument "barcodes" must be a dict-like object mapping barcodes to counts')
         self._mapping: Mapping[str, int] = barcodes
 
         if not isinstance(barcode_length, int) and barcode_length > 0:
-            raise ValueError('barcode length must be a positive integer')
+            raise ValueError('The argument "barcode_length" must be a positive integer')
         self._barcode_length: int = barcode_length
 
     def __contains__(self, item) -> bool:
@@ -257,10 +258,9 @@ class ErrorsToCorrectBarcodesMap:
     """
 
     def __init__(self, errors_to_barcodes: Mapping[str, str]):
-
         if not isinstance(errors_to_barcodes, Mapping):
-            raise TypeError('errors_to_barcodes must be a mapping of erroneous barcodes to correct '
-                            'barcodes, not %a' % type(errors_to_barcodes))
+            raise TypeError(f'The argument "errors_to_barcodes" must be a mapping of erroneous barcodes to correct '
+                            f'barcodes, not {type(errors_to_barcodes)}')
         self._map = errors_to_barcodes
 
     def get_corrected_barcode(self, barcode: str) -> str:
@@ -349,6 +349,6 @@ def correct_bam(self, bam_file: str, output_bam_file: str) -> None:
                 try:
                     tag = self.get_corrected_barcode(alignment.get_tag('CR'))
                 except KeyError:  # pass through the uncorrected barcode.
-                    tag = alignment.get_tag('CR')
-                alignment.set_tag(tag='CB', value=tag, value_type='Z')
+                    tag = alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY)
+                alignment.set_tag(tag=consts.CELL_BARCODE_TAG_KEY, value=tag, value_type='Z')
                 fout.write(alignment)
diff --git a/src/sctools/consts.py b/src/sctools/consts.py
@@ -0,0 +1,36 @@
+"""
+Global constants
+================
+
+.. currentmodule:: sctools
+
+This module contains global constants, such as various barcoded BAM tags, and sctools-specific
+constants.
+"""
+
+# BAM tag constants
+
+RAW_SAMPLE_BARCODE_TAG_KEY = 'SR'
+QUALITY_SAMPLE_BARCODE_TAG_KEY = 'SY'
+
+MOLECULE_BARCODE_TAG_KEY = 'UB'
+RAW_MOLECULE_BARCODE_TAG_KEY = 'UR'
+QUALITY_MOLECULE_BARCODE_TAG_KEY = 'UY'
+
+CELL_BARCODE_TAG_KEY = 'CB'
+RAW_CELL_BARCODE_TAG_KEY = 'CR'
+QUALITY_CELL_BARCODE_TAG_KEY = 'CY'
+
+GENE_NAME_TAG_KEY = 'GE'
+NUMBER_OF_HITS_TAG_KEY = 'NH'
+
+ALIGNMENT_LOCATION_TAG_KEY = 'XF'
+INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTRONIC'
+CODING_ALIGNMENT_LOCATION_TAG_VALUE = 'CODING'
+UTR_ALIGNMENT_LOCATION_TAG_VALUE = 'UTR'
+INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTERGENIC'
+
+# bam.py constants
+
+MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500
+MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000