Added a unit test that reproduces the expected count matrix for snRNA…

…-Seq mode in CreateCountMatrix (#71) * intial commit * snRNA-Seq unit test in pytest working * fixed the pytest for snRNA functionallity * corrected the chromosome-gene-exon from gene_locations * Update count.py * addressed the review comments and formatted code * Update count.py Co-authored-by: Nikolas Barkas <nikolas.barkas@outlook.com>
HumanCellAtlas · Feb 26, 2020 · a3ec39d · a3ec39d
1 parent a4bd5e6
commit a3ec39d
Show file tree

Hide file tree

Showing 10 changed files with 756 additions and 146 deletions.
diff --git a/src/sctools/consts.py b/src/sctools/consts.py
@@ -34,3 +34,8 @@
 
 MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500
 MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000
+
+
+# modes of the count matrix runs
+SINGLE_CELL_COUNT_MATRIX = 0
+SINGLE_NUCLEI_COUNT_MATRIX = 1
diff --git a/src/sctools/count.py b/src/sctools/count.py
@@ -135,7 +135,7 @@ def from_sorted_tagged_bam(
         cls,
         bam_file: str,
         gene_name_to_index: Dict[str, int],
-        gene_locations: Dict[str, List[tuple]] = None,
+        chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None,
         cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY,
         molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY,
         gene_name_tag: str = consts.GENE_NAME_TAG_KEY,
@@ -172,7 +172,7 @@ def from_sorted_tagged_bam(
         bam_file : str
             input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that
             order
-        gene_locations : dict
+        chromosomes_gene_locations_extended : dict
             Location of genes by chromosome
             (default = None)
         cell_barcode_tag : str, optional
@@ -251,22 +251,31 @@ def from_sorted_tagged_bam(
         ) in grouped_records_generator:
 
             # modify alignments to include the gene name to the alignments to INTRONIC regions
-            if gene_locations:
+            if chromosomes_gene_locations_extended:
                 alignments = []
                 for alignment in input_alignments:
                     if alignment.has_tag("XF"):
                         aln_type = alignment.get_tag("XF")
                         if (
                             alignment.reference_name
                             and aln_type == "INTRONIC"
-                            and alignment.reference_name in gene_locations
+                            and alignment.reference_name
+                            in chromosomes_gene_locations_extended
                         ):
                             gene_name = cls.binary_overlap(
-                                gene_locations[alignment.reference_name],
+                                chromosomes_gene_locations_extended[
+                                    alignment.reference_name
+                                ],
                                 0,
-                                len(gene_locations[alignment.reference_name]) - 1,
+                                len(
+                                    chromosomes_gene_locations_extended[
+                                        alignment.reference_name
+                                    ]
+                                )
+                                - 1,
                                 alignment.reference_start,
                             )
+
                             if gene_name:
                                 alignment.set_tag("GE", gene_name)
                     alignments.append(alignment)

diff --git a/src/sctools/gtf.py b/src/sctools/gtf.py
@@ -350,3 +350,56 @@ def extract_extended_gene_names(
         # Sort by starting location
         gene_locations[chromosome].sort(key=lambda x: x[0])
     return gene_locations
+
+
+def extract_gene_exons(
+    files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#"
+) -> Dict[str, List[tuple]]:
+    """Extract extended gene names from GTF file(s) and returns a map from gene names to the the
+    list of exons in the ascending order of the start positions file(s).
+
+    Parameters
+    ----------
+    files : Union[str, List], optional
+        File(s) to read. If '-', read sys.stdin (default = '-')
+    mode : {'r', 'rb'}, optional
+        Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r').
+    header_comment_char : str, optional
+        lines beginning with this character are skipped (default = '#')
+
+    Returns
+    -------
+    Dict[str, List[tuple]]
+        A dictionary of chromosome names mapping to a List of tuples, each containing
+        a the exons in the ascending order of the start positions.
+        Dict[str, List(Tuple((start,end), gene)))
+    """
+    gene_name_to_start_end = dict()
+    for record in Reader(files, mode, header_comment_char).filter(
+        retain_types=["exon"]
+    ):
+        gene_name = record.get_attribute("gene_name")
+        if gene_name is None:
+            raise ValueError(
+                f"Malformed GTF file detected. Record is of type gene but does not have a "
+                f'"gene_name" field: {record}'
+            )
+        if record.chromosome not in gene_name_to_start_end:
+            gene_name_to_start_end[record.chromosome] = dict()
+
+        if gene_name not in gene_name_to_start_end[record.chromosome]:
+            gene_name_to_start_end[record.chromosome][gene_name] = []
+
+        gene_name_to_start_end[record.chromosome][gene_name].append(
+            (record.start, record.end)
+        )
+
+    gene_locations_exons = dict()
+    # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start
+    for chromosome in gene_name_to_start_end:
+        gene_locations_exons[chromosome] = [
+            (locs, key) for key, locs in gene_name_to_start_end[chromosome].items()
+        ]
+        # Sort by starting location
+        gene_locations_exons[chromosome].sort(key=lambda x: x[0])
+    return gene_locations_exons
diff --git a/src/sctools/platform.py b/src/sctools/platform.py
@@ -447,7 +447,7 @@ def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int:
         matrix = count.CountMatrix.from_sorted_tagged_bam(
             bam_file=args.bam_file,
             gene_name_to_index=gene_name_to_index,
-            gene_locations=gene_locations,
+            chromosomes_gene_locations_extended=gene_locations,
             cell_barcode_tag=args.cell_barcode_tag,
             molecule_barcode_tag=args.molecule_barcode_tag,
             gene_name_tag=args.gene_name_tag,

diff --git a/src/sctools/test/test_barcode.py b/src/sctools/test/test_barcode.py
@@ -144,8 +144,7 @@ def tagged_bamfile():
         outbam,
     ]
     platform.TenXV2.attach_barcodes(args)
-    yield outbam
-    os.remove(outbam)
+    return outbam
 
 
 def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x):