Skip to content

Commit

Permalink
Merge pull request #43 from HumanCellAtlas/mb-cell-ranger-func-equiva…
Browse files Browse the repository at this point in the history
…lence

CountMatrix bugfix and new tests
  • Loading branch information
mbabadi committed Jul 25, 2018
2 parents 24ed25f + 57b9369 commit 9bff1c6
Show file tree
Hide file tree
Showing 21 changed files with 1,126 additions and 300 deletions.
1 change: 1 addition & 0 deletions src/sctools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from . import reader
from . import metrics
from . import platform
from . import consts
from pkg_resources import get_distribution, DistributionNotFound


Expand Down
141 changes: 104 additions & 37 deletions src/sctools/bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,36 @@
Methods
-------
iter_tag_groups function to iterate over reads by an arbitrary tag
iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags
iter_genes wrapper for iter_tag_groups that iterates over gene tags
iter_molecules wrapper for iter_tag_groups that iterates over molecule tags
iter_tag_groups function to iterate over reads by an arbitrary tag
iter_cell_barcodes wrapper for iter_tag_groups that iterates over cell barcode tags
iter_genes wrapper for iter_tag_groups that iterates over gene tags
iter_molecules wrapper for iter_tag_groups that iterates over molecule tags
Classes
-------
SubsetAlignments class to extract reads specific to requested chromosome(s)
Tagger class to add tags to sam/bam records from paired fastq records
SubsetAlignments class to extract reads specific to requested chromosome(s)
Tagger class to add tags to sam/bam records from paired fastq records
AlignmentSortOrder abstract class to represent alignment sort orders
QueryNameSortOrder alignment sort order by query name
CellMoleculeGeneQueryNameSortOrder alignment sort order hierarchically cell > molecule > gene > query name
References
----------
htslib : https://github.com/samtools/htslib
"""

import warnings
import os
import math
import os
import warnings
from abc import abstractmethod
from itertools import cycle
from typing import Iterator, Generator, List, Union, Tuple
from typing import Iterator, Generator, List, Dict, Union, Tuple, Callable, Any, Optional

import pysam

from . import consts


class SubsetAlignments:
"""Wrapper for pysam/htslib that extracts reads corresponding to requested chromosome(s)
Expand Down Expand Up @@ -74,9 +80,8 @@ def __init__(self, alignment_file: str, open_mode: str=None):
open_mode = 'r'
else:
raise ValueError(
'could not autodetect file type for alignment_file %s (detectible suffixes: '
'.sam, .bam)' % alignment_file
)
f'Could not autodetect file type for alignment_file {alignment_file} (detectable suffixes: '
f'.sam, .bam)')
self._file: str = alignment_file
self._open_mode: str = open_mode

Expand Down Expand Up @@ -164,7 +169,7 @@ class Tagger:

def __init__(self, bam_file: str) -> None:
if not isinstance(bam_file, str):
raise TypeError('bam_file must be type str, not %s' % type(bam_file))
raise TypeError(f'The argument "bam_file" must be of type str, not {type(bam_file)}')
self.bam_file = bam_file

# todo add type to tag_generators (make sure it doesn't introduce import issues
Expand Down Expand Up @@ -193,7 +198,7 @@ def tag(self, output_bam_name: str, tag_generators) -> None:
outbam.write(sam_record)


def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]:
def split(in_bam: str, out_prefix: str, *tags, approx_mb_per_split=1000, raise_missing=True) -> List[str]:
"""split `in_bam` by tag into files of `approx_mb_per_split`
Parameters
Expand All @@ -203,7 +208,7 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru
out_prefix : str
Prefix for all output files; output will be named as prefix_n where n is an integer equal
to the chunk number.
tags : list
tags : tuple
The bam tags to split on. The tags are checked in order, and sorting is done based on the
first identified tag. Further tags are only checked if the first tag is missing. This is
useful in cases where sorting is executed over a corrected barcode, but some records only
Expand Down Expand Up @@ -231,44 +236,47 @@ def split(in_bam, out_prefix, *tags, approx_mb_per_split=1000, raise_missing=Tru
if len(tags) == 0:
raise ValueError('At least one tag must be passed')

def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None:
def _cleanup(
_files_to_counts: Dict[pysam.AlignmentFile, int], _files_to_names: Dict[pysam.AlignmentFile, str],
rm_all: bool=False) -> None:
"""Closes file handles and remove any empty files.
Parameters
----------
files_to_counts : dict
_files_to_counts : dict
Dictionary of file objects to the number of reads each contains.
files_to_names : dict
_files_to_names : dict
Dictionary of file objects to file handles.
rm_all : bool, optional
If True, indicates all files should be removed, regardless of count number, else only
empty files without counts are removed (default = False)
"""
for bamfile, count in files_to_counts.items():
for bamfile, count in _files_to_counts.items():
# corner case: clean up files that were created, but didn't get data because
# n_cell < n_barcode
bamfile.close()
if count == 0 or rm_all:
os.remove(files_to_names[bamfile])
del files_to_names[bamfile]
os.remove(_files_to_names[bamfile])
del _files_to_names[bamfile]

# find correct number of subfiles to spawn
bam_mb = os.path.getsize(in_bam) * 1e-6
n_subfiles = int(math.ceil(bam_mb / approx_mb_per_split))
if n_subfiles > 500:
warnings.warn('Number of requested subfiles (%d) exceeds 500; this may cause OS errors by '
'exceeding fid limits' % n_subfiles)
if n_subfiles > 1000:
raise ValueError('Number of requested subfiles (%d) exceeds 1000; this will usually cause '
'OS errors, think about increasing max_mb_per_split.' % n_subfiles)
if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN:
warnings.warn(f'Number of requested subfiles ({n_subfiles}) exceeds '
f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_WARN}; this may cause OS errors by exceeding fid limits')
if n_subfiles > consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE:
raise ValueError(f'Number of requested subfiles ({n_subfiles}) exceeds '
f'{consts.MAX_BAM_SPLIT_SUBFILES_TO_RAISE}; this will usually cause OS errors, '
f'think about increasing max_mb_per_split.')

# create all the output files
with pysam.AlignmentFile(in_bam, 'rb', check_sq=False) as input_alignments:

# map files to counts
files_to_counts = {}
files_to_names = {}
files_to_counts: Dict[pysam.AlignmentFile, int] = {}
files_to_names: Dict[pysam.AlignmentFile, str] = {}
for i in range(n_subfiles):
out_bam_name = out_prefix + '_%d.bam' % i
open_bam = pysam.AlignmentFile(out_bam_name, 'wb', template=input_alignments)
Expand Down Expand Up @@ -297,8 +305,7 @@ def _cleanup(files_to_counts, files_to_names, rm_all=False) -> None:
if tag_content is None:
if raise_missing:
_cleanup(files_to_counts, files_to_names, rm_all=True)
raise RuntimeError(
'alignment encountered that is missing %s tag(s).' % repr(tags))
raise RuntimeError('Alignment encountered that is missing {repr(tags)} tag(s).')
else:
continue # move on to next alignment

Expand Down Expand Up @@ -379,12 +386,12 @@ def iter_molecule_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Gene
Yields
------
grouped_by_tag : Iterator[pysam.AlignedSegment]
reads sharing a unique molecule barcode (``UB`` tag)
reads sharing a unique molecule barcode tag
current_tag : str
the molecule barcode that records in the group all share
"""
return iter_tag_groups(tag='UB', bam_iterator=bam_iterator)
return iter_tag_groups(tag=consts.MOLECULE_BARCODE_TAG_KEY, bam_iterator=bam_iterator)


def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
Expand All @@ -398,12 +405,12 @@ def iter_cell_barcodes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generato
Yields
------
grouped_by_tag : Iterator[pysam.AlignedSegment]
reads sharing a unique cell barcode (``CB`` tag)
reads sharing a unique cell barcode tag
current_tag : str
the cell barcode that reads in the group all share
"""
return iter_tag_groups(tag='CB', bam_iterator=bam_iterator)
return iter_tag_groups(tag=consts.CELL_BARCODE_TAG_KEY, bam_iterator=bam_iterator)


def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
Expand All @@ -417,9 +424,69 @@ def iter_genes(bam_iterator: Iterator[pysam.AlignedSegment]) -> Generator:
Yields
------
grouped_by_tag : Iterator[pysam.AlignedSegment]
reads sharing a unique gene id (``GE`` tag)
reads sharing a unique gene name tag
current_tag : str
the gene id that reads in the group all share
"""
return iter_tag_groups(tag='GE', bam_iterator=bam_iterator)
return iter_tag_groups(tag=consts.GENE_NAME_TAG_KEY, bam_iterator=bam_iterator)


def get_tag_or_default(alignment: pysam.AlignedSegment, tag_key: str, default: Optional[str] = None) -> Optional[str]:
"""Extracts the value associated to `tag_key` from `alignment`, and returns a default value
if the tag is not present."""
try:
return alignment.get_tag(tag_key)
except KeyError:
return default


class AlignmentSortOrder:
"""The base class of alignment sort orders."""
@property
@abstractmethod
def key_generator(self) -> Callable[[pysam.AlignedSegment], Any]:
"""Returns a callable function that calculates a sort key from given pysam.AlignedSegment."""
raise NotImplementedError


class QueryNameSortOrder(AlignmentSortOrder):
"""Alignment record sort order by query name."""
@staticmethod
def get_sort_key(alignment: pysam.AlignedSegment) -> str:
return alignment.query_name

@property
def key_generator(self):
return QueryNameSortOrder.get_sort_key

def __repr__(self) -> str:
return 'query_name'


class CellMoleculeGeneQueryNameSortOrder(AlignmentSortOrder):
"""Hierarchical alignment record sort order (cell barcode >= molecule barcode >= gene name >= query name)."""
def __init__(
self,
cell_barcode_tag_key: str = consts.CELL_BARCODE_TAG_KEY,
molecule_barcode_tag_key: str = consts.MOLECULE_BARCODE_TAG_KEY,
gene_name_tag_key: str = consts.GENE_NAME_TAG_KEY) -> None:
assert cell_barcode_tag_key, "Cell barcode tag key can not be None"
assert molecule_barcode_tag_key, "Molecule barcode tag key can not be None"
assert gene_name_tag_key, "Gene name tag key can not be None"
self.cell_barcode_tag_key = cell_barcode_tag_key
self.molecule_barcode_tag_key = molecule_barcode_tag_key
self.gene_name_tag_key = gene_name_tag_key

def _get_sort_key(self, alignment: pysam.AlignedSegment) -> Tuple[str, str, str, str]:
return (get_tag_or_default(alignment, self.cell_barcode_tag_key, default='N'),
get_tag_or_default(alignment, self.molecule_barcode_tag_key, default='N'),
get_tag_or_default(alignment, self.gene_name_tag_key, default='N'),
alignment.query_name)

@property
def key_generator(self) -> Callable[[pysam.AlignedSegment], Tuple[str, str, str, str]]:
return self._get_sort_key

def __repr__(self) -> str:
return 'hierarchical__cell_molecule_gene_query_name'
14 changes: 7 additions & 7 deletions src/sctools/barcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import numpy as np
import pysam

from . import consts
from .encodings import TwoBit
from .stats import base4_entropy

Expand All @@ -47,11 +48,11 @@ class can optionally be constructed from an iterable where barcodes can be prese
"""
def __init__(self, barcodes: Mapping[str, int], barcode_length: int):
if not isinstance(barcodes, Mapping):
raise TypeError('barcode set must be a dict-like object mapping barcodes to counts')
raise TypeError('The argument "barcodes" must be a dict-like object mapping barcodes to counts')
self._mapping: Mapping[str, int] = barcodes

if not isinstance(barcode_length, int) and barcode_length > 0:
raise ValueError('barcode length must be a positive integer')
raise ValueError('The argument "barcode_length" must be a positive integer')
self._barcode_length: int = barcode_length

def __contains__(self, item) -> bool:
Expand Down Expand Up @@ -257,10 +258,9 @@ class ErrorsToCorrectBarcodesMap:
"""

def __init__(self, errors_to_barcodes: Mapping[str, str]):

if not isinstance(errors_to_barcodes, Mapping):
raise TypeError('errors_to_barcodes must be a mapping of erroneous barcodes to correct '
'barcodes, not %a' % type(errors_to_barcodes))
raise TypeError(f'The argument "errors_to_barcodes" must be a mapping of erroneous barcodes to correct '
f'barcodes, not {type(errors_to_barcodes)}')
self._map = errors_to_barcodes

def get_corrected_barcode(self, barcode: str) -> str:
Expand Down Expand Up @@ -349,6 +349,6 @@ def correct_bam(self, bam_file: str, output_bam_file: str) -> None:
try:
tag = self.get_corrected_barcode(alignment.get_tag('CR'))
except KeyError: # pass through the uncorrected barcode.
tag = alignment.get_tag('CR')
alignment.set_tag(tag='CB', value=tag, value_type='Z')
tag = alignment.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY)
alignment.set_tag(tag=consts.CELL_BARCODE_TAG_KEY, value=tag, value_type='Z')
fout.write(alignment)
36 changes: 36 additions & 0 deletions src/sctools/consts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Global constants
================
.. currentmodule:: sctools
This module contains global constants, such as various barcoded BAM tags, and sctools-specific
constants.
"""

# BAM tag constants

RAW_SAMPLE_BARCODE_TAG_KEY = 'SR'
QUALITY_SAMPLE_BARCODE_TAG_KEY = 'SY'

MOLECULE_BARCODE_TAG_KEY = 'UB'
RAW_MOLECULE_BARCODE_TAG_KEY = 'UR'
QUALITY_MOLECULE_BARCODE_TAG_KEY = 'UY'

CELL_BARCODE_TAG_KEY = 'CB'
RAW_CELL_BARCODE_TAG_KEY = 'CR'
QUALITY_CELL_BARCODE_TAG_KEY = 'CY'

GENE_NAME_TAG_KEY = 'GE'
NUMBER_OF_HITS_TAG_KEY = 'NH'

ALIGNMENT_LOCATION_TAG_KEY = 'XF'
INTRONIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTRONIC'
CODING_ALIGNMENT_LOCATION_TAG_VALUE = 'CODING'
UTR_ALIGNMENT_LOCATION_TAG_VALUE = 'UTR'
INTERGENIC_ALIGNMENT_LOCATION_TAG_VALUE = 'INTERGENIC'

# bam.py constants

MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500
MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000

0 comments on commit 9bff1c6

Please sign in to comment.