From a3ec39d33334ab92b5c7907dd5ee9c28fb476249 Mon Sep 17 00:00:00 2001
From: Kishori M Konwar <43380010+kishorikonwar@users.noreply.github.com>
Date: Wed, 26 Feb 2020 15:03:48 -0500
Subject: [PATCH] Added a unit test that reproduces the expected count matrix
 for snRNA-Seq mode in CreateCountMatrix (#71)

* intial commit

* snRNA-Seq unit test in pytest working

* fixed the pytest for snRNA functionallity

* corrected the chromosome-gene-exon from gene_locations

* Update count.py

* addressed the review comments and formatted code

* Update count.py

Co-authored-by: Nikolas Barkas <nikolas.barkas@outlook.com>
---
 src/sctools/consts.py                |   5 +
 src/sctools/count.py                 |  21 +-
 src/sctools/gtf.py                   |  53 +++
 src/sctools/platform.py              |   2 +-
 src/sctools/test/test_barcode.py     |   3 +-
 src/sctools/test/test_count.py       | 552 ++++++++++++++++++++++++++-
 src/sctools/test/test_encodings.py   |  26 +-
 src/sctools/test/test_entrypoints.py | 170 ++++-----
 src/sctools/test/test_gtf.py         |  48 +--
 src/sctools/test/test_platform.py    |  22 +-
 10 files changed, 756 insertions(+), 146 deletions(-)

diff --git a/src/sctools/consts.py b/src/sctools/consts.py
index 8c2c27b..9a57e08 100644
--- a/src/sctools/consts.py
+++ b/src/sctools/consts.py
@@ -34,3 +34,8 @@
 
 MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500
 MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000
+
+
+# modes of the count matrix runs
+SINGLE_CELL_COUNT_MATRIX = 0
+SINGLE_NUCLEI_COUNT_MATRIX = 1
diff --git a/src/sctools/count.py b/src/sctools/count.py
index ab3babf..7df748b 100644
--- a/src/sctools/count.py
+++ b/src/sctools/count.py
@@ -135,7 +135,7 @@ def from_sorted_tagged_bam(
         cls,
         bam_file: str,
         gene_name_to_index: Dict[str, int],
-        gene_locations: Dict[str, List[tuple]] = None,
+        chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None,
         cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY,
         molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY,
         gene_name_tag: str = consts.GENE_NAME_TAG_KEY,
@@ -172,7 +172,7 @@ def from_sorted_tagged_bam(
         bam_file : str
             input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that
             order
-        gene_locations : dict
+        chromosomes_gene_locations_extended : dict
             Location of genes by chromosome
             (default = None)
         cell_barcode_tag : str, optional
@@ -251,7 +251,7 @@ def from_sorted_tagged_bam(
         ) in grouped_records_generator:
 
             # modify alignments to include the gene name to the alignments to INTRONIC regions
-            if gene_locations:
+            if chromosomes_gene_locations_extended:
                 alignments = []
                 for alignment in input_alignments:
                     if alignment.has_tag("XF"):
@@ -259,14 +259,23 @@ def from_sorted_tagged_bam(
                         if (
                             alignment.reference_name
                             and aln_type == "INTRONIC"
-                            and alignment.reference_name in gene_locations
+                            and alignment.reference_name
+                            in chromosomes_gene_locations_extended
                         ):
                             gene_name = cls.binary_overlap(
-                                gene_locations[alignment.reference_name],
+                                chromosomes_gene_locations_extended[
+                                    alignment.reference_name
+                                ],
                                 0,
-                                len(gene_locations[alignment.reference_name]) - 1,
+                                len(
+                                    chromosomes_gene_locations_extended[
+                                        alignment.reference_name
+                                    ]
+                                )
+                                - 1,
                                 alignment.reference_start,
                             )
+
                             if gene_name:
                                 alignment.set_tag("GE", gene_name)
                     alignments.append(alignment)
diff --git a/src/sctools/gtf.py b/src/sctools/gtf.py
index 3f17480..cf5ff57 100644
--- a/src/sctools/gtf.py
+++ b/src/sctools/gtf.py
@@ -350,3 +350,56 @@ def extract_extended_gene_names(
         # Sort by starting location
         gene_locations[chromosome].sort(key=lambda x: x[0])
     return gene_locations
+
+
+def extract_gene_exons(
+    files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#"
+) -> Dict[str, List[tuple]]:
+    """Extract extended gene names from GTF file(s) and returns a map from gene names to the the
+    list of exons in the ascending order of the start positions file(s).
+
+    Parameters
+    ----------
+    files : Union[str, List], optional
+        File(s) to read. If '-', read sys.stdin (default = '-')
+    mode : {'r', 'rb'}, optional
+        Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r').
+    header_comment_char : str, optional
+        lines beginning with this character are skipped (default = '#')
+
+    Returns
+    -------
+    Dict[str, List[tuple]]
+        A dictionary of chromosome names mapping to a List of tuples, each containing
+        a the exons in the ascending order of the start positions.
+        Dict[str, List(Tuple((start,end), gene)))
+    """
+    gene_name_to_start_end = dict()
+    for record in Reader(files, mode, header_comment_char).filter(
+        retain_types=["exon"]
+    ):
+        gene_name = record.get_attribute("gene_name")
+        if gene_name is None:
+            raise ValueError(
+                f"Malformed GTF file detected. Record is of type gene but does not have a "
+                f'"gene_name" field: {record}'
+            )
+        if record.chromosome not in gene_name_to_start_end:
+            gene_name_to_start_end[record.chromosome] = dict()
+
+        if gene_name not in gene_name_to_start_end[record.chromosome]:
+            gene_name_to_start_end[record.chromosome][gene_name] = []
+
+        gene_name_to_start_end[record.chromosome][gene_name].append(
+            (record.start, record.end)
+        )
+
+    gene_locations_exons = dict()
+    # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start
+    for chromosome in gene_name_to_start_end:
+        gene_locations_exons[chromosome] = [
+            (locs, key) for key, locs in gene_name_to_start_end[chromosome].items()
+        ]
+        # Sort by starting location
+        gene_locations_exons[chromosome].sort(key=lambda x: x[0])
+    return gene_locations_exons
diff --git a/src/sctools/platform.py b/src/sctools/platform.py
index 877b1f7..2d746ad 100644
--- a/src/sctools/platform.py
+++ b/src/sctools/platform.py
@@ -447,7 +447,7 @@ def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int:
         matrix = count.CountMatrix.from_sorted_tagged_bam(
             bam_file=args.bam_file,
             gene_name_to_index=gene_name_to_index,
-            gene_locations=gene_locations,
+            chromosomes_gene_locations_extended=gene_locations,
             cell_barcode_tag=args.cell_barcode_tag,
             molecule_barcode_tag=args.molecule_barcode_tag,
             gene_name_tag=args.gene_name_tag,
diff --git a/src/sctools/test/test_barcode.py b/src/sctools/test/test_barcode.py
index fd022b5..f9fa39a 100644
--- a/src/sctools/test/test_barcode.py
+++ b/src/sctools/test/test_barcode.py
@@ -144,8 +144,7 @@ def tagged_bamfile():
         outbam,
     ]
     platform.TenXV2.attach_barcodes(args)
-    yield outbam
-    os.remove(outbam)
+    return outbam
 
 
 def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x):
diff --git a/src/sctools/test/test_count.py b/src/sctools/test/test_count.py
index eb04d06..bfd3cbe 100644
--- a/src/sctools/test/test_count.py
+++ b/src/sctools/test/test_count.py
@@ -56,6 +56,7 @@
 from typing import Callable, Optional, List, Set, Tuple, Dict, Generator
 
 import numpy as np
+import scipy.sparse as sp
 import pysam
 import pytest
 
@@ -75,6 +76,10 @@
 _test_num_multiple_gene_alignments = 20
 _test_max_gene_hits_per_multiple_gene_alignments = 5
 
+_test_num_only_exons = 10
+_test_num_only_introns = 10
+_test_both_exons_introns = 10
+
 
 @pytest.fixture(scope="module")
 def gene_name_to_index() -> Dict[str, int]:
@@ -89,10 +94,12 @@ def __init__(
         cell_barcode: Optional[str],
         molecule_barcode: Optional[str],
         gene_name: Optional[str],
+        alignment_location: Optional[str] = "EXONIC",
     ) -> None:
         self.cell_barcode = cell_barcode
         self.molecule_barcode = molecule_barcode
         self.gene_name = gene_name
+        self.alignment_location = alignment_location
 
     def __hash__(self):
         return hash((self.cell_barcode, self.molecule_barcode, self.gene_name))
@@ -101,7 +108,8 @@ def __repr__(self):
         return (
             f"{consts.CELL_BARCODE_TAG_KEY}: {self.cell_barcode}, "
             f"{consts.MOLECULE_BARCODE_TAG_KEY}: {self.molecule_barcode}, "
-            f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}"
+            f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}",
+            f"{consts.ALIGNMENT_LOCATION_TAG_KEY}: {self.alignment_location}",
         )
 
 
@@ -202,6 +210,7 @@ def __init__(
             k for k, v in sorted(gene_name_to_index.items(), key=operator.itemgetter(1))
         ]
         self.num_genes = len(self.all_gene_names)
+
         self.max_genes = max_genes
         assert (
             max_genes <= self.num_genes
@@ -439,6 +448,8 @@ def _generate_aligned_segment_from_tags(
         i_query: int,
         num_queries: int,
         rng: np.random.RandomState,
+        record_reference_id: Optional[int] = 0,
+        reference_start: Optional[int] = -1,
     ) -> pysam.AlignedSegment:
         """Generates pysam.AlignedSegment instances from alignment_tags.
 
@@ -476,14 +487,31 @@ def _generate_aligned_segment_from_tags(
             )
         if alignment_tags.gene_name:
             tags.append((consts.GENE_NAME_TAG_KEY, alignment_tags.gene_name, "Z"))
+
+        if alignment_tags.alignment_location:
+            tags.append(
+                (
+                    consts.ALIGNMENT_LOCATION_TAG_KEY,
+                    alignment_tags.alignment_location,
+                    "Z",
+                )
+            )
+
         record = pysam.AlignedSegment()
         record.query_name = SyntheticTaggedBAMGenerator._generate_query_name(
             query_prefix, i_query, num_queries
         )
-        record.reference_start = rng.randint(
-            low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH
+
+        if reference_start == -1:
+            record.reference_start = rng.randint(
+                low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH
+            )
+        else:
+            record.reference_start = reference_start
+
+        record.reference_id = (
+            record_reference_id  # note: we only use one synthetic sequence
         )
-        record.reference_id = 0  # note: we only use one synthetic sequence
         if len(tags) > 0:
             record.set_tags(tags)
         return record
@@ -813,3 +841,519 @@ def test_count_matrix_from_bam(
             )
         ]
     )
+
+
+def extract_gene_non_exons(
+    chromosome_gene_exons: Dict[str, List[tuple]],
+    chromosome_gene_locations_extended: Dict[str, List[tuple]],
+) -> Dict[str, Dict[str, List[tuple]]]:
+
+    chromosome_gene_non_exons = {}
+
+    for chromosome in chromosome_gene_exons:
+        chromosome_gene_non_exons[chromosome] = {}
+        gene_name_exon_list = {}
+        for gene_exons in chromosome_gene_exons[chromosome]:
+            gene_name_exon_list[gene_exons[1]] = gene_exons[0]
+
+        gene_name_location_dict = {}
+        for gene_locations in chromosome_gene_locations_extended[chromosome]:
+            gene_name_location_dict[gene_locations[1]] = gene_locations[0]
+
+        for gene_name in gene_name_location_dict:
+            non_exon_list = []
+            if gene_name in gene_name_exon_list:
+
+                start, end = gene_name_location_dict[gene_name]
+                coords = gene_name_exon_list[gene_name]
+                coords.sort(key=lambda a: a[0])
+
+                x = start
+                y = coords[0][0] - 1
+                i = 0
+
+                n = len(coords)
+                while i < n:
+                    if y <= coords[i][0]:
+                        if x < y:
+                            non_exon_list.append((x, y))
+                        x = coords[i][1]
+                    else:
+                        x = max(x, coords[i][1])
+
+                    if i < n - 1:
+                        y = min(end, coords[i + 1][0])
+                    i += 1
+            chromosome_gene_non_exons[chromosome][gene_name] = non_exon_list.copy()
+
+    return chromosome_gene_non_exons
+
+
+@pytest.mark.parametrize(
+    "alignment_sort_order",
+    [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()],
+    ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"],
+)
+def test_count_matrix_with_introns(
+    alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index
+):
+    _count_matrix_with_introns(
+        alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX
+    )
+    _count_matrix_with_introns(
+        alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX
+    )
+
+
+def _count_matrix_with_introns(
+    alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index
+):
+
+    chromosomes_gene_locations_extended = gtf.extract_extended_gene_names(
+        _test_annotation_file
+    )
+    chromosomes_gene_exons = gtf.extract_gene_exons(_test_annotation_file)
+
+    _test_chromosomes_gene_non_exons = extract_gene_non_exons(
+        chromosomes_gene_exons, chromosomes_gene_locations_extended
+    )
+
+    _test_chromosomes_gene_exons = {}
+    for chromosome in chromosomes_gene_exons:
+        _test_chromosomes_gene_exons[chromosome] = {}
+        for gene_exons in chromosomes_gene_exons[chromosome]:
+            _test_chromosomes_gene_exons[chromosome][gene_exons[1]] = gene_exons[0]
+
+    # instantiate a test data generator
+    chromosome = list(_test_chromosomes_gene_exons.keys())[0]
+
+    synthetic_data_generator = SyntheticTaggedAlignmentTypeBAMGenerator(
+        _test_num_cells,
+        _test_max_genes,
+        _test_chromosomes_gene_exons[chromosome],
+        _test_chromosomes_gene_non_exons[chromosome],
+    )
+
+    _test_temp_dir = tempfile.TemporaryDirectory()
+    try:
+        # generate test data
+        synthetic_data_generator.generate_synthetic_bam_and_counts_matrix(
+            _test_temp_dir.name,
+            gene_name_to_index,
+            test_index,
+            alignment_sort_order=alignment_sort_order,
+        )
+
+        # test data paths
+        test_bam_path = os.path.join(
+            _test_temp_dir.name,
+            SyntheticTaggedAlignmentTypeBAMGenerator.bam_output_filename,
+        )
+        test_count_matrix_path = os.path.join(
+            _test_temp_dir.name,
+            SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename,
+        )
+        test_row_index_path = os.path.join(
+            _test_temp_dir.name,
+            SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename,
+        )
+        test_col_index_path = os.path.join(
+            _test_temp_dir.name,
+            SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename,
+        )
+        # create CountMatrix from the synthetic bam
+        if test_index == consts.SINGLE_CELL_COUNT_MATRIX:
+            count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam(
+                test_bam_path, gene_name_to_index
+            )
+        if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX:
+            count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam(
+                test_bam_path,
+                gene_name_to_index,
+                chromosomes_gene_locations_extended=chromosomes_gene_locations_extended,
+            )
+
+        # load the test counts matrix
+        _count_matrix_data_expected = sp.csr_matrix(np.load(test_count_matrix_path))
+        row_index_expected = np.load(test_row_index_path)
+        col_index_expected = np.load(test_col_index_path)
+
+        count_matrix_data_expected = CountMatrix(
+            _count_matrix_data_expected, row_index_expected, col_index_expected
+        )
+        count_matrix_data_expected = count_matrix_data_expected.matrix.todense()
+
+    finally:
+        _test_temp_dir.cleanup()
+
+    count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense()
+    row_index_from_bam = count_matrix_from_bam.row_index
+    col_index_from_bam = count_matrix_from_bam.col_index
+
+    # sort expected and from_bam results by their respective row and column indices, since their sort order
+    # is not part of the design specs and is considered arbitrary
+    (
+        sorted_count_matrix_data_from_bam,
+        sorted_row_index_from_bam,
+        sorted_col_index_from_bam,
+    ) = _get_sorted_count_matrix(
+        count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam
+    )
+    (
+        sorted_count_matrix_data_expected,
+        sorted_row_index_expected,
+        sorted_col_index_expected,
+    ) = _get_sorted_count_matrix(
+        count_matrix_data_expected, row_index_expected, col_index_expected
+    )
+
+    assert all(
+        [
+            row_name_from_bam == row_name_expected
+            for row_name_from_bam, row_name_expected in zip(
+                sorted_row_index_from_bam, sorted_row_index_expected
+            )
+        ]
+    )
+    assert all(
+        [
+            col_name_from_bam == col_name_expected
+            for col_name_from_bam, col_name_expected in zip(
+                sorted_col_index_from_bam, sorted_col_index_expected
+            )
+        ]
+    )
+
+    assert np.allclose(
+        sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected
+    )
+
+
+class SyntheticTaggedAlignmentTypeBAMGenerator:
+    """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as
+    described in the preamble documentation block.
+
+    Parameters
+    ----------
+    num_cells : int
+        number of real cells
+    max-genes : int
+        maximum number of genes to use to generate synthetic counts
+    chromosomes_gene_exons : Dict[str, Dict[str, List[tuple]]]
+        keys at the first level refers to chromosome number, keys at the
+        second level refers to a gene and with the list of exonic regions as values
+    chromosomes_gene_non_exons : Dict[str, Dict[str, List[tuple]]]
+        keys at the first level refers to chromosome number, keys at the
+        second level refers to a gene and with the list of intronic regions as values
+
+    rng_seed : int
+        random number generator seed
+
+    Methods
+    -------
+    generate_synthetic_bam_and_counts_matrix
+        generates synthetic test data and writes the output to disk
+
+    See Also
+    --------
+    count.from_sorted_tagged_bam
+    """
+
+    OUTPUT_PREFIX = "intronic_"
+    SYNTHETIC_SEQUENCE_LENGTH = 5
+    REFERENCE_SEQUENCE_NAME = "1"
+    # EXONIC_SEQUENCE_NAME = "EXONIC_SEQUENCE"
+    SYNTHETIC_SEQUENCE_LENGTH = 100
+
+    bam_output_filename = OUTPUT_PREFIX + "records.bam"
+    count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy"
+    row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy"
+    col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy"
+
+    def __init__(
+        self,
+        num_cells: int,
+        max_genes: int,
+        chromosomes_gene_exons: Dict[str, Dict[str, List[tuple]]],
+        chromosomes_gene_non_exons: Dict[str, List[tuple]],
+        rng_seed: int = 777,
+    ) -> None:
+        self.num_cells = num_cells
+
+        self.chromosomes_gene_exons = chromosomes_gene_exons
+        self.chromosomes_gene_non_exons = chromosomes_gene_non_exons
+
+        # initialize the random number generator
+        self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed)
+
+        # generate gene names
+        self.all_gene_names = list(self.chromosomes_gene_exons.keys())[:max_genes]
+        self.num_genes = len(self.all_gene_names)
+
+        self.max_genes = max_genes
+        assert (
+            max_genes <= self.num_genes
+        ), f"Max genes ({self.max_genes}) must be <= to all annotated genes ({self.num_genes})"
+        self.to_be_used_gene_indices: List[int] = self.rng.choice(
+            np.arange(0, self.num_genes, dtype=np.int),
+            size=self.max_genes,
+            replace=False,
+        ).tolist()
+        self.to_be_used_gene_names = [
+            self.all_gene_names[j] for j in self.to_be_used_gene_indices
+        ]
+
+    def _generate_random_cell_barcode(self, length: int = 16):
+        return self._generate_random_genomic_sequences(length)
+
+    def _generate_random_molecule_barcode(self, length: int = 10):
+        return self._generate_random_genomic_sequences(length)
+
+    def _generate_random_genomic_sequences(self, length: int):
+        return "".join(self.rng.choice(["A", "C", "T", "G"], size=length))
+
+    def _generate_location_based_tag_list(
+        self, num_alignments: int, gene_names: List[str], alignment_location: str
+    ):
+        alignment_record_tags = []
+        for i in range(num_alignments):
+            alignment_record_tags.append(
+                AlignmentRecordTags(
+                    self._generate_random_cell_barcode(),
+                    self._generate_random_molecule_barcode(),
+                    gene_names[i],
+                    alignment_location,
+                )
+            )
+
+        return alignment_record_tags
+
+    def _add_alignment_start_coordinates(self, alignment_tags, alignment_location):
+        _alignment_tags = []
+
+        for alignment_tag in alignment_tags:
+            if alignment_location == "EXONIC":
+                if alignment_tag.gene_name in self.chromosomes_gene_exons:
+                    coord = self.chromosomes_gene_exons[alignment_tag.gene_name]
+                    setattr(alignment_tag, "coordinate", coord[0][0] + 1)
+                    _alignment_tags.append(alignment_tag)
+
+            if alignment_location == "INTRONIC":
+                if alignment_tag.gene_name in self.chromosomes_gene_non_exons:
+                    coord = self.chromosomes_gene_non_exons[alignment_tag.gene_name]
+                    if coord:
+                        setattr(alignment_tag, "coordinate", coord[0][0] + 1)
+                        alignment_tag.gene_name = ""
+                        _alignment_tags.append(alignment_tag)
+
+        return _alignment_tags
+
+    def generate_synthetic_bam_and_counts_matrix(
+        self,
+        output_path: str,
+        gene_name_to_index: int,
+        test_index: int,
+        alignment_sort_order: bam.AlignmentSortOrder = CellMoleculeGeneQueryNameSortOrder(),
+    ):
+        """Generates synthetic count matrix and BAM file and writes them to disk.
+
+            Parameters
+            ----------
+            output_path : str
+                output path
+            gene_name_to_index : Dict[str, int]
+                gene name to an index
+            test_index : int
+                0 for single cell matrix and 1 for single nuclei matrix
+            alignment_sort_order : bam.AlignmentSortOrder
+                sort order of BAM alignment records; if 'None', random sort order is implied
+
+            Returns
+            -------
+            None
+            """
+
+        gene_names_alignments = []
+
+        for gene_name in sorted(self.chromosomes_gene_non_exons.keys()):
+            if self.chromosomes_gene_non_exons[gene_name]:
+                gene_names_alignments.append(gene_name)
+
+        gene_names: List[int] = []
+        cell_ids: List[int] = []
+
+        records = []
+        # Only exons, expected in both single-cell and single-nuclei modes
+        exonic_alignment_tags = self._generate_location_based_tag_list(
+            10, gene_names_alignments[0:], "EXONIC"
+        )
+        exonic_alignment_tags = self._add_alignment_start_coordinates(
+            exonic_alignment_tags, "EXONIC"
+        )
+
+        for i, alignment_tag in enumerate(exonic_alignment_tags):
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                alignment_tag,
+                "EXONIC",
+                i,
+                10,
+                self.rng,
+                reference_start=alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+            gene_names.append(alignment_tag.gene_name)
+            cell_ids.append(alignment_tag.cell_barcode)
+
+        "Only introns only in single-nuclei mode"
+        intronic_alignment_tags = self._generate_location_based_tag_list(
+            3, gene_names_alignments[10:], "INTRONIC"
+        )
+        intronic_alignment_tags = self._add_alignment_start_coordinates(
+            intronic_alignment_tags, "INTRONIC"
+        )
+        for i, alignment_tag in enumerate(intronic_alignment_tags):
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                alignment_tag,
+                "INTRONIC",
+                i + 10,
+                10,
+                self.rng,
+                reference_start=alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+            if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX:
+                gene_names.append(gene_names_alignments[i + 10])
+                cell_ids.append(alignment_tag.cell_barcode)
+
+        "both intron and exons from the same gene in bost single-cell and single-nuclei modes"
+        exonic_alignment_tags = self._generate_location_based_tag_list(
+            10, gene_names_alignments[20:], "EXONIC"
+        )
+        exonic_alignment_tags = self._add_alignment_start_coordinates(
+            exonic_alignment_tags, "EXONIC"
+        )
+
+        _intronic_alignment_tags = self._generate_location_based_tag_list(
+            10, gene_names_alignments[20:], "INTRONIC"
+        )
+        intronic_alignment_tags = []
+        for intronic_tag, exonic_tag in zip(
+            _intronic_alignment_tags, exonic_alignment_tags
+        ):
+            intronic_tag.cell_barcode = exonic_tag.cell_barcode
+            intronic_alignment_tags.append(intronic_tag)
+        intronic_alignment_tags = self._add_alignment_start_coordinates(
+            intronic_alignment_tags, "INTRONIC"
+        )
+
+        for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate(
+            zip(exonic_alignment_tags, intronic_alignment_tags)
+        ):
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                exonic_alignment_tag,
+                "EXONINTRONSAME",
+                i + 20,
+                10,
+                self.rng,
+                reference_start=exonic_alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                intronic_alignment_tag,
+                "EXONINTRONSAME",
+                i + 20,
+                10,
+                self.rng,
+                reference_start=intronic_alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+            cell_ids.append(exonic_alignment_tag.cell_barcode)
+            gene_names.append(exonic_alignment_tag.gene_name)
+
+        # both intron and exons from separate genes should not appear in single-cell mode
+        exonic_alignment_tags = self._generate_location_based_tag_list(
+            10, gene_names_alignments[30:], "EXONIC"
+        )
+        exonic_alignment_tags = self._add_alignment_start_coordinates(
+            exonic_alignment_tags, "EXONIC"
+        )
+
+        _intronic_alignment_tags = self._generate_location_based_tag_list(
+            10, gene_names_alignments[31:], "INTRONIC"
+        )
+        intronic_alignment_tags = []
+        for intronic_tag, exonic_tag in zip(
+            _intronic_alignment_tags, exonic_alignment_tags
+        ):
+            intronic_tag.cell_barcode = exonic_tag.cell_barcode
+            intronic_alignment_tags.append(intronic_tag)
+        intronic_alignment_tags = self._add_alignment_start_coordinates(
+            intronic_alignment_tags, "INTRONIC"
+        )
+
+        for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate(
+            zip(exonic_alignment_tags, intronic_alignment_tags)
+        ):
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                exonic_alignment_tag,
+                "EXONINTRONSEP",
+                i + 30,
+                10,
+                self.rng,
+                reference_start=exonic_alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+
+            pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags(
+                intronic_alignment_tag,
+                "EXONINTRONSEP",
+                i + 30,
+                10,
+                self.rng,
+                reference_start=intronic_alignment_tag.coordinate,
+            )
+            records.append(pysam_alignment)
+
+            if test_index == consts.SINGLE_CELL_COUNT_MATRIX:
+                cell_ids.append(exonic_alignment_tag.cell_barcode)
+                gene_names.append(exonic_alignment_tag.gene_name)
+
+        # write BAM file
+        with pysam.AlignmentFile(
+            os.path.join(output_path, self.bam_output_filename),
+            mode="wb",
+            reference_names=[self.REFERENCE_SEQUENCE_NAME],
+            reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH],
+        ) as bo:
+            for record in records:
+                bo.write(record)
+
+        n_genes = len(gene_name_to_index)
+        n_data = len(cell_ids)
+        # write count matrix, row index, and col index
+        count_matrix = np.zeros((n_data, n_genes), dtype=np.int32)
+        for i, (cell_id, gene_name) in enumerate(zip(cell_ids, gene_names)):
+            count_matrix[i][gene_name_to_index[gene_name]] = 1
+
+        test_count_matrix_path = os.path.join(
+            output_path,
+            SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename,
+        )
+        test_row_index_path = os.path.join(
+            output_path,
+            SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename,
+        )
+        test_col_index_path = os.path.join(
+            output_path,
+            SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename,
+        )
+
+        np.save(test_count_matrix_path, count_matrix)
+        np.save(test_row_index_path, cell_ids)
+        gene_rank = [(gene, rank) for gene, rank in gene_name_to_index.items()]
+        gene_rank.sort(key=lambda x: x[1])
+        gene_names = [x[0] for x in gene_rank]
+        np.save(test_col_index_path, gene_names)
+
+        return os.path.join(output_path, self.bam_output_filename)
diff --git a/src/sctools/test/test_encodings.py b/src/sctools/test/test_encodings.py
index d1d3dd9..1eeb458 100644
--- a/src/sctools/test/test_encodings.py
+++ b/src/sctools/test/test_encodings.py
@@ -3,23 +3,23 @@
 from itertools import combinations
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def sequence():
-    return b'ACGTTTGAGATGAGATATAGANNNN'
+    return b"ACGTTTGAGATGAGATATAGANNNN"
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def encoder_2bit(sequence):
     length = len(sequence)
     return encodings.TwoBit(length)
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def encoder_3bit():
     return encodings.ThreeBit()
 
 
-@pytest.fixture(scope='module', params=[encodings.TwoBit, encodings.ThreeBit])
+@pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit])
 def encoder(request):
     return request.param
 
@@ -39,34 +39,34 @@ def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit):
 
 
 def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit):
-    sequence_no_n = b'AGCGCGAT'
-    gc_content = sequence_no_n.count(b'C') + sequence_no_n.count(b'G')
+    sequence_no_n = b"AGCGCGAT"
+    gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G")
     encoded = encoder_2bit.encode(sequence_no_n)
     assert encoder_2bit.gc_content(encoded) == gc_content
 
 
 def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit):
     encoded = encoder_3bit.encode(sequence)
-    assert encoder_3bit.gc_content(encoded) == sequence.count(b'C') + sequence.count(
-        b'G'
+    assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count(
+        b"G"
     )
 
 
 def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit):
     with pytest.raises(KeyError):
-        encoder_2bit.encode(b'ACGTP')  # P is not a valid code
+        encoder_2bit.encode(b"ACGTP")  # P is not a valid code
 
 
 def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit):
-    encoded = encoder_3bit.encode(b'ACGTP')  # P is not a valid code
+    encoded = encoder_3bit.encode(b"ACGTP")  # P is not a valid code
     decoded = encoder_3bit.decode(encoded)
-    assert decoded == b'ACGTN'
+    assert decoded == b"ACGTN"
 
 
 @pytest.fixture
 def simple_barcodes():
     """simple barcode set with min_hamming = 1, max_hamming = 2"""
-    return [b'ACGT', b'ACGG', b'ACGA', b'ACGC', b'TCGT', b'CCGT', b'GCGT']
+    return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"]
 
 
 @pytest.fixture
diff --git a/src/sctools/test/test_entrypoints.py b/src/sctools/test/test_entrypoints.py
index 36143c3..419b325 100644
--- a/src/sctools/test/test_entrypoints.py
+++ b/src/sctools/test/test_entrypoints.py
@@ -9,24 +9,24 @@
 
 from sctools import bam, platform, count, consts
 
-data_dir = os.path.split(__file__)[0] + '/data/'
+data_dir = os.path.split(__file__)[0] + "/data/"
 
 
 def test_Attach10XBarcodes_entrypoint():
     args = [
-        '--r1',
-        data_dir + 'test_r1.fastq',
-        '--i1',
-        data_dir + 'test_i7.fastq',
-        '--u2',
-        data_dir + 'test.bam',
-        '--output-bamfile',
-        'test_tagged_bam.bam',
+        "--r1",
+        data_dir + "test_r1.fastq",
+        "--i1",
+        data_dir + "test_i7.fastq",
+        "--u2",
+        data_dir + "test.bam",
+        "--output-bamfile",
+        "test_tagged_bam.bam",
     ]
 
     rc = platform.TenXV2.attach_barcodes(args)
     assert rc == 0
-    with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f:
+    with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f:
         for alignment in f:
             # each alignment should now have a tag, and that tag should be a string
             assert isinstance(
@@ -43,27 +43,27 @@ def test_Attach10XBarcodes_entrypoint():
             assert isinstance(
                 alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str
             )
-    os.remove('test_tagged_bam.bam')  # clean up
+    os.remove("test_tagged_bam.bam")  # clean up
 
 
 def test_Attach10XBarcodes_entrypoint_with_whitelist():
     args = [
-        '--r1',
-        data_dir + 'test_r1.fastq',
-        '--i1',
-        data_dir + 'test_i7.fastq',
-        '--u2',
-        data_dir + 'test.bam',
-        '--output-bamfile',
-        'test_tagged_bam.bam',
-        '--whitelist',
-        data_dir + '1k-august-2016.txt',
+        "--r1",
+        data_dir + "test_r1.fastq",
+        "--i1",
+        data_dir + "test_i7.fastq",
+        "--u2",
+        data_dir + "test.bam",
+        "--output-bamfile",
+        "test_tagged_bam.bam",
+        "--whitelist",
+        data_dir + "1k-august-2016.txt",
     ]
 
     return_call = platform.TenXV2.attach_barcodes(args)
     assert return_call == 0
     success = False
-    with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f:
+    with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f:
         for alignment in f:
             if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY):
                 success = True
@@ -83,23 +83,23 @@ def test_Attach10XBarcodes_entrypoint_with_whitelist():
                 alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str
             )
     assert success
-    os.remove('test_tagged_bam.bam')  # clean up
+    os.remove("test_tagged_bam.bam")  # clean up
 
 
 def test_AttachBarcodes_entrypoint_with_whitelist():
     # test of the BarcodePlatform.attach_barcodes entry point with
     # sample, cell, and molecule barcodes all specified
     args = [
-        '--r1',
-        data_dir + 'test_r1.fastq',
-        '--i1',
-        data_dir + 'test_i7.fastq',
-        '--u2',
-        data_dir + 'test.bam',
-        '--output-bamfile',
-        'test_tagged_bam.bam',
-        '--whitelist',
-        data_dir + '1k-august-2016.txt',
+        "--r1",
+        data_dir + "test_r1.fastq",
+        "--i1",
+        data_dir + "test_i7.fastq",
+        "--u2",
+        data_dir + "test.bam",
+        "--output-bamfile",
+        "test_tagged_bam.bam",
+        "--whitelist",
+        data_dir + "1k-august-2016.txt",
         "--sample-barcode-start-position",
         "0",
         "--sample-barcode-length",
@@ -117,7 +117,7 @@ def test_AttachBarcodes_entrypoint_with_whitelist():
     return_call = platform.BarcodePlatform.attach_barcodes(args)
     assert return_call == 0
     success = False
-    with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f:
+    with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f:
         for alignment in f:
             if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY):
                 success = True
@@ -138,33 +138,33 @@ def test_AttachBarcodes_entrypoint_with_whitelist():
                 alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str
             )
     assert success
-    os.remove('test_tagged_bam.bam')  # clean up
+    os.remove("test_tagged_bam.bam")  # clean up
 
 
 def test_split_bam():
     tag_args = [
-        '--r1',
-        data_dir + 'test_r1.fastq',
-        '--i1',
-        data_dir + 'test_i7.fastq',
-        '--u2',
-        data_dir + 'test.bam',
-        '--output-bamfile',
-        'test_tagged_bam.bam',
-        '--whitelist',
-        data_dir + '1k-august-2016.txt',
+        "--r1",
+        data_dir + "test_r1.fastq",
+        "--i1",
+        data_dir + "test_i7.fastq",
+        "--u2",
+        data_dir + "test.bam",
+        "--output-bamfile",
+        "test_tagged_bam.bam",
+        "--whitelist",
+        data_dir + "1k-august-2016.txt",
     ]
 
     platform.TenXV2.attach_barcodes(tag_args)
 
     split_args = [
-        '--bamfile',
-        'test_tagged_bam.bam',
-        '--output-prefix',
-        'test_tagged',
-        '--subfile-size',
-        '0.005',
-        '--tags',
+        "--bamfile",
+        "test_tagged_bam.bam",
+        "--output-prefix",
+        "test_tagged",
+        "--subfile-size",
+        "0.005",
+        "--tags",
         consts.CELL_BARCODE_TAG_KEY,
         consts.RAW_CELL_BARCODE_TAG_KEY,
     ]
@@ -172,17 +172,17 @@ def test_split_bam():
     return_call = platform.GenericPlatform.split_bam(split_args)
     assert return_call == 0
 
-    for f in glob.glob('test_tagged*'):
+    for f in glob.glob("test_tagged*"):
         os.remove(f)
 
 
 def test_tag_sort_bam():
     args = [
-        '-i',
-        data_dir + 'unsorted.bam',
-        '-o',
-        'test_sorted.bam',
-        '-t',
+        "-i",
+        data_dir + "unsorted.bam",
+        "-o",
+        "test_sorted.bam",
+        "-t",
         consts.CELL_BARCODE_TAG_KEY,
         consts.GENE_NAME_TAG_KEY,
         consts.MOLECULE_BARCODE_TAG_KEY,
@@ -196,28 +196,28 @@ def test_tag_sort_bam():
         consts.GENE_NAME_TAG_KEY,
         consts.MOLECULE_BARCODE_TAG_KEY,
     ]
-    with pysam.AlignmentFile('test_sorted.bam', 'rb') as f:
+    with pysam.AlignmentFile("test_sorted.bam", "rb") as f:
         segments = f.fetch(until_eof=True)
         tag_sortable_records = (
             bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments
         )
         bam.verify_sort(tag_sortable_records, tag_keys)
 
-    for f in glob.glob('test_sorted*'):
+    for f in glob.glob("test_sorted*"):
         os.remove(f)
 
 
 def test_tag_sort_bam_dash_t_specified_multiple_times():
     args = [
-        '-i',
-        data_dir + 'unsorted.bam',
-        '-o',
-        'test_sorted.bam',
-        '-t',
+        "-i",
+        data_dir + "unsorted.bam",
+        "-o",
+        "test_sorted.bam",
+        "-t",
         consts.CELL_BARCODE_TAG_KEY,
-        '-t',
+        "-t",
         consts.GENE_NAME_TAG_KEY,
-        '-t',
+        "-t",
         consts.MOLECULE_BARCODE_TAG_KEY,
     ]
 
@@ -229,40 +229,40 @@ def test_tag_sort_bam_dash_t_specified_multiple_times():
         consts.GENE_NAME_TAG_KEY,
         consts.MOLECULE_BARCODE_TAG_KEY,
     ]
-    with pysam.AlignmentFile('test_sorted.bam', 'rb') as f:
+    with pysam.AlignmentFile("test_sorted.bam", "rb") as f:
         segments = f.fetch(until_eof=True)
         tag_sortable_record_generator = (
             bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments
         )
         bam.verify_sort(tag_sortable_record_generator, tag_keys)
 
-    for f in glob.glob('test_sorted*'):
+    for f in glob.glob("test_sorted*"):
         os.remove(f)
 
 
 def test_tag_sort_bam_no_tags():
-    args = ['-i', data_dir + 'unsorted.bam', '-o', 'test_sorted.bam']
+    args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"]
 
     return_call = platform.GenericPlatform.tag_sort_bam(args)
     assert return_call == 0
 
     tag_keys = []
-    with pysam.AlignmentFile('test_sorted.bam', 'rb') as f:
+    with pysam.AlignmentFile("test_sorted.bam", "rb") as f:
         segments = f.fetch(until_eof=True)
         tag_sortable_records = (
             bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments
         )
         bam.verify_sort(tag_sortable_records, tag_keys)
 
-    for f in glob.glob('test_sorted*'):
+    for f in glob.glob("test_sorted*"):
         os.remove(f)
 
 
 def test_verify_bam_sort():
     args = [
-        '-i',
-        data_dir + 'cell-gene-umi-queryname-sorted.bam',
-        '-t',
+        "-i",
+        data_dir + "cell-gene-umi-queryname-sorted.bam",
+        "-t",
         consts.CELL_BARCODE_TAG_KEY,
         consts.GENE_NAME_TAG_KEY,
         consts.MOLECULE_BARCODE_TAG_KEY,
@@ -274,9 +274,9 @@ def test_verify_bam_sort():
 
 def test_verify_bam_sort_raises_error_on_unsorted():
     args = [
-        '-i',
-        data_dir + 'unsorted.bam',
-        '-t',
+        "-i",
+        data_dir + "unsorted.bam",
+        "-t",
         consts.CELL_BARCODE_TAG_KEY,
         consts.GENE_NAME_TAG_KEY,
         consts.MOLECULE_BARCODE_TAG_KEY,
@@ -293,15 +293,15 @@ def test_count_merge():
     matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr()
     # be lazy and reuse the inds as the col and row index
     counts = count.CountMatrix(matrix, ind, col)
-    counts.save(tmp + '/test_input_1')
-    counts.save(tmp + '/test_input_2')
+    counts.save(tmp + "/test_input_1")
+    counts.save(tmp + "/test_input_2")
 
     merge_args = [
-        '-o',
-        tmp + '/test_merged_counts',
-        '-i',
-        tmp + '/test_input_2',
-        tmp + '/test_input_1',
+        "-o",
+        tmp + "/test_merged_counts",
+        "-i",
+        tmp + "/test_input_2",
+        tmp + "/test_input_1",
     ]
     return_call = platform.GenericPlatform.merge_count_matrices(merge_args)
     assert return_call == 0
diff --git a/src/sctools/test/test_gtf.py b/src/sctools/test/test_gtf.py
index 49661bd..fd74ea9 100644
--- a/src/sctools/test/test_gtf.py
+++ b/src/sctools/test/test_gtf.py
@@ -3,42 +3,42 @@
 from itertools import chain
 import pytest
 
-_data_dir = os.path.split(__file__)[0] + '/data'
-_files = ['%s/%s' % (_data_dir, f) for f in ('test.gtf', 'test.gtf.gz', 'test.gtf.bz2')]
+_data_dir = os.path.split(__file__)[0] + "/data"
+_files = ["%s/%s" % (_data_dir, f) for f in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")]
 
 
-@pytest.fixture(scope='module', params=_files)
+@pytest.fixture(scope="module", params=_files)
 def files(request):
     """returns a filename"""
     return request.param
 
 
 def test_opens_file_reads_first_line(files):
-    rd = gtf.Reader(files, 'r', header_comment_char='#')
+    rd = gtf.Reader(files, "r", header_comment_char="#")
     record = next(iter(rd))
     assert isinstance(record, gtf.GTFRecord)
 
 
 def test_opens_file_populates_fields_properly(files):
-    rd = gtf.Reader(files, 'r', header_comment_char='#')
+    rd = gtf.Reader(files, "r", header_comment_char="#")
     record = next(iter(rd))
-    assert record.seqname == 'chr19'
-    assert record.chromosome == 'chr19'
-    assert record.source == 'HAVANA'
-    assert record.feature == 'gene'
+    assert record.seqname == "chr19"
+    assert record.chromosome == "chr19"
+    assert record.source == "HAVANA"
+    assert record.feature == "gene"
     assert record.start == 60951
     assert record.end == 71626
-    assert record.score == '.'
-    assert record.strand == '-'
-    assert record.frame == '.'
+    assert record.score == "."
+    assert record.strand == "-"
+    assert record.frame == "."
 
     expected_features = {
-        'gene_id': 'ENSG00000282458.1',
-        'gene_type': 'transcribed_processed_pseudogene',
-        'gene_status': 'KNOWN',
-        'gene_name': 'WASH5P',
-        'level': '2',
-        'havana_gene': 'OTTHUMG00000180466.8',
+        "gene_id": "ENSG00000282458.1",
+        "gene_type": "transcribed_processed_pseudogene",
+        "gene_status": "KNOWN",
+        "gene_name": "WASH5P",
+        "level": "2",
+        "havana_gene": "OTTHUMG00000180466.8",
     }
     assert record._attributes == expected_features
 
@@ -49,21 +49,21 @@ def test_opens_file_populates_fields_properly(files):
 
 
 def test_set_attribute_verify_included_in_output_string(files):
-    rd = gtf.Reader(files, 'r', header_comment_char='#')
+    rd = gtf.Reader(files, "r", header_comment_char="#")
     record = next(iter(rd))
-    record.set_attribute('test_attr', 'foo')
-    assert record.get_attribute('test_attr') == 'foo'
+    record.set_attribute("test_attr", "foo")
+    assert record.get_attribute("test_attr") == "foo"
 
     # verify in output string
-    assert 'foo' in str(record)
+    assert "foo" in str(record)
 
 
 def test_opens_file_parses_size(files):
-    rd = gtf.Reader(files, 'r', header_comment_char='#')
+    rd = gtf.Reader(files, "r", header_comment_char="#")
     record = next(iter(rd))
     assert 71626 - 60951 == record.size
 
     # mangle record, make sure error is raised
     record._fields[3:5] = [record.end, record.start]
     with pytest.raises(ValueError):
-        getattr(record, 'size')
+        getattr(record, "size")
diff --git a/src/sctools/test/test_platform.py b/src/sctools/test/test_platform.py
index c80c1f7..e18e0cd 100644
--- a/src/sctools/test/test_platform.py
+++ b/src/sctools/test/test_platform.py
@@ -4,7 +4,7 @@
 
 from .. import platform
 
-data_dir = os.path.split(__file__)[0] + '/data/'
+data_dir = os.path.split(__file__)[0] + "/data/"
 
 
 def test_attach_barcodes():
@@ -13,15 +13,15 @@ def test_attach_barcodes():
     temp_dir_name = tempfile.mkdtemp()
 
     # Construct cli arguments to pass to the command
-    temp_output_bam = temp_dir_name + 'output.bam'
+    temp_output_bam = temp_dir_name + "output.bam"
 
     args = [
         "--r1",
-        data_dir + 'test_r1.fastq',
+        data_dir + "test_r1.fastq",
         "--u2",
-        data_dir + 'test_r2.bam',
+        data_dir + "test_r2.bam",
         "--i1",
-        data_dir + 'test_i1.fastq',
+        data_dir + "test_i1.fastq",
         "--o",
         temp_output_bam,
         "--sample-barcode-start-pos",
@@ -42,12 +42,12 @@ def test_attach_barcodes():
 
     with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile:
         for read in samfile:
-            tag_cr = read.get_tag('CR')
-            tag_cy = read.get_tag('CY')
-            tag_ur = read.get_tag('UR')
-            tag_uy = read.get_tag('UY')
-            tag_sr = read.get_tag('SR')
-            tag_sy = read.get_tag('SY')
+            tag_cr = read.get_tag("CR")
+            tag_cy = read.get_tag("CY")
+            tag_ur = read.get_tag("UR")
+            tag_uy = read.get_tag("UY")
+            tag_sr = read.get_tag("SR")
+            tag_sy = read.get_tag("SY")
             assert len(tag_cr) == 16
             assert len(tag_cy) == 16
             assert len(tag_ur) == 4