Merge pull request #75 from HumanCellAtlas/kmk-snrna-seq-multi-gene

Kmk snrna seq multi gene
HumanCellAtlas · Mar 24, 2020 · dc07854 · dc07854
2 parents aaed0b9 + 109da40
commit dc07854
Show file tree

Hide file tree

Showing 7 changed files with 31 additions and 46 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,2 +1 @@
 #files ignored when building docker image
-*/*/test
diff --git a/.gitignore b/.gitignore
@@ -26,6 +26,7 @@ wheels/
 *.egg-info/
 .installed.cfg
 *.egg
+test/data/bam_with_tags_test.bam
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

diff --git a/Dockerfile b/Dockerfile
@@ -7,6 +7,12 @@ LABEL maintainer="Ambrose J. Carr <acarr@broadinstitute.org>" \
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt
 
+RUN mkdir /sctools/
+
+COPY .  /sctools 
+
+RUN  pip3 install /sctools
+
 WORKDIR usr/local/bin/sctools
 
-COPY src/sctools .
+
diff --git a/requirements.txt b/requirements.txt
@@ -14,4 +14,4 @@ requests==2.20.0
 scipy==1.3.1
 setuptools==40.4.3
 setuptools_scm==3.1.0
-tables==3.4.2
+tables==3.4.2
diff --git a/src/sctools/count.py b/src/sctools/count.py
@@ -14,6 +14,7 @@
     gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb')
 from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str)
 
+
 Notes
 -----
 Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus.
@@ -251,44 +252,20 @@ def from_sorted_tagged_bam(
         ) in grouped_records_generator:
 
             # modify alignments to include the gene name to the alignments to INTRONIC regions
-            if chromosomes_gene_locations_extended:
-                alignments = []
-                for alignment in input_alignments:
-                    if alignment.has_tag("XF"):
-                        aln_type = alignment.get_tag("XF")
-                        if (
-                            alignment.reference_name
-                            and aln_type == "INTRONIC"
-                            and alignment.reference_name
-                            in chromosomes_gene_locations_extended
-                        ):
-                            gene_name = cls.binary_overlap(
-                                chromosomes_gene_locations_extended[
-                                    alignment.reference_name
-                                ],
-                                0,
-                                len(
-                                    chromosomes_gene_locations_extended[
-                                        alignment.reference_name
-                                    ]
-                                )
-                                - 1,
-                                alignment.reference_start,
-                            )
-
-                            if gene_name:
-                                alignment.set_tag("GE", gene_name)
-                    alignments.append(alignment)
-            else:
-                alignments = input_alignments
+            alignments = input_alignments
 
             # only keep queries w/ well-formed UMIs
+            gene_name = None
             if cell_barcode is None or molecule_barcode is None:
                 continue
 
             if len(alignments) == 1:
                 primary_alignment = alignments[0]
-                if primary_alignment.has_tag(gene_name_tag):
+                if (
+                    primary_alignment.has_tag(gene_name_tag)
+                    and primary_alignment.has_tag('XF')
+                    and primary_alignment.get_tag('XF') != 'INTERGENIC'
+                ):
                     gene_name = primary_alignment.get_tag(gene_name_tag)
                     # overlaps multiple genes, drop query, and unfortunately there only one
                     # one alignment for this query
@@ -299,15 +276,24 @@ def from_sorted_tagged_bam(
             else:  # multi-map
                 implicated_gene_names: Set[str] = set()
                 for alignment in alignments:
-                    if alignment.has_tag(gene_name_tag):
+                    if (
+                        alignment.has_tag(gene_name_tag)
+                        and alignment.has_tag('XF')
+                        and alignment.get_tag('XF') != 'INTERGENIC'
+                    ):
                         # consider its gene name only if it has only  gene name
+                        gene_name = alignment.get_tag(gene_name_tag)
                         if len(gene_name.split(',')) == 1:
                             implicated_gene_names.add(alignment.get_tag(gene_name_tag))
+
                 if len(implicated_gene_names) == 1:  # only one gene
                     gene_name = implicated_gene_names.__iter__().__next__()
                 else:
                     continue  # drop query
 
+            if gene_name is None:
+                continue
+
             if (
                 cell_barcode,
                 molecule_barcode,

diff --git a/src/sctools/metrics/gatherer.py b/src/sctools/metrics/gatherer.py
@@ -198,6 +198,10 @@ def extract_metrics(self, mode: str = 'rb') -> None:
             for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
                 metric_aggregator = GeneMetrics()
 
+                # in case of multi-genes ignore as in the counting stage
+                if gene_tag and len(gene_tag.split(',')) > 1:
+                    continue
+
                 # break up gene ids by cell barcodes
                 for cell_iterator, cell_tag in iter_cell_barcodes(
                     bam_iterator=gene_iterator

diff --git a/src/sctools/test/test_count.py b/src/sctools/test/test_count.py
@@ -894,17 +894,6 @@ def extract_gene_non_exons(
     [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()],
     ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"],
 )
-def test_count_matrix_with_introns(
-    alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index
-):
-    _count_matrix_with_introns(
-        alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX
-    )
-    _count_matrix_with_introns(
-        alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX
-    )
-
-
 def _count_matrix_with_introns(
     alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index
 ):