Skip to content

Commit

Permalink
Merge pull request #75 from HumanCellAtlas/kmk-snrna-seq-multi-gene
Browse files Browse the repository at this point in the history
Kmk snrna seq multi gene
  • Loading branch information
Fab-T committed Mar 24, 2020
2 parents aaed0b9 + 109da40 commit dc07854
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 46 deletions.
1 change: 0 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
#files ignored when building docker image
*/*/test
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ wheels/
*.egg-info/
.installed.cfg
*.egg
test/data/bam_with_tags_test.bam

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ LABEL maintainer="Ambrose J. Carr <acarr@broadinstitute.org>" \
COPY requirements.txt .
RUN pip3 install -r requirements.txt

RUN mkdir /sctools/

COPY . /sctools

RUN pip3 install /sctools

WORKDIR usr/local/bin/sctools

COPY src/sctools .

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ requests==2.20.0
scipy==1.3.1
setuptools==40.4.3
setuptools_scm==3.1.0
tables==3.4.2
tables==3.4.2
50 changes: 18 additions & 32 deletions src/sctools/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb')
from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str)
Notes
-----
Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus.
Expand Down Expand Up @@ -251,44 +252,20 @@ def from_sorted_tagged_bam(
) in grouped_records_generator:

# modify alignments to include the gene name to the alignments to INTRONIC regions
if chromosomes_gene_locations_extended:
alignments = []
for alignment in input_alignments:
if alignment.has_tag("XF"):
aln_type = alignment.get_tag("XF")
if (
alignment.reference_name
and aln_type == "INTRONIC"
and alignment.reference_name
in chromosomes_gene_locations_extended
):
gene_name = cls.binary_overlap(
chromosomes_gene_locations_extended[
alignment.reference_name
],
0,
len(
chromosomes_gene_locations_extended[
alignment.reference_name
]
)
- 1,
alignment.reference_start,
)

if gene_name:
alignment.set_tag("GE", gene_name)
alignments.append(alignment)
else:
alignments = input_alignments
alignments = input_alignments

# only keep queries w/ well-formed UMIs
gene_name = None
if cell_barcode is None or molecule_barcode is None:
continue

if len(alignments) == 1:
primary_alignment = alignments[0]
if primary_alignment.has_tag(gene_name_tag):
if (
primary_alignment.has_tag(gene_name_tag)
and primary_alignment.has_tag('XF')
and primary_alignment.get_tag('XF') != 'INTERGENIC'
):
gene_name = primary_alignment.get_tag(gene_name_tag)
# overlaps multiple genes, drop query, and unfortunately there only one
# one alignment for this query
Expand All @@ -299,15 +276,24 @@ def from_sorted_tagged_bam(
else: # multi-map
implicated_gene_names: Set[str] = set()
for alignment in alignments:
if alignment.has_tag(gene_name_tag):
if (
alignment.has_tag(gene_name_tag)
and alignment.has_tag('XF')
and alignment.get_tag('XF') != 'INTERGENIC'
):
# consider its gene name only if it has only gene name
gene_name = alignment.get_tag(gene_name_tag)
if len(gene_name.split(',')) == 1:
implicated_gene_names.add(alignment.get_tag(gene_name_tag))

if len(implicated_gene_names) == 1: # only one gene
gene_name = implicated_gene_names.__iter__().__next__()
else:
continue # drop query

if gene_name is None:
continue

if (
cell_barcode,
molecule_barcode,
Expand Down
4 changes: 4 additions & 0 deletions src/sctools/metrics/gatherer.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ def extract_metrics(self, mode: str = 'rb') -> None:
for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
metric_aggregator = GeneMetrics()

# in case of multi-genes ignore as in the counting stage
if gene_tag and len(gene_tag.split(',')) > 1:
continue

# break up gene ids by cell barcodes
for cell_iterator, cell_tag in iter_cell_barcodes(
bam_iterator=gene_iterator
Expand Down
11 changes: 0 additions & 11 deletions src/sctools/test/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,17 +894,6 @@ def extract_gene_non_exons(
[bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()],
ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"],
)
def test_count_matrix_with_introns(
alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index
):
_count_matrix_with_introns(
alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX
)
_count_matrix_with_introns(
alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX
)


def _count_matrix_with_introns(
alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index
):
Expand Down

0 comments on commit dc07854

Please sign in to comment.