From a3ec39d33334ab92b5c7907dd5ee9c28fb476249 Mon Sep 17 00:00:00 2001 From: Kishori M Konwar <43380010+kishorikonwar@users.noreply.github.com> Date: Wed, 26 Feb 2020 15:03:48 -0500 Subject: [PATCH] Added a unit test that reproduces the expected count matrix for snRNA-Seq mode in CreateCountMatrix (#71) * intial commit * snRNA-Seq unit test in pytest working * fixed the pytest for snRNA functionallity * corrected the chromosome-gene-exon from gene_locations * Update count.py * addressed the review comments and formatted code * Update count.py Co-authored-by: Nikolas Barkas --- src/sctools/consts.py | 5 + src/sctools/count.py | 21 +- src/sctools/gtf.py | 53 +++ src/sctools/platform.py | 2 +- src/sctools/test/test_barcode.py | 3 +- src/sctools/test/test_count.py | 552 ++++++++++++++++++++++++++- src/sctools/test/test_encodings.py | 26 +- src/sctools/test/test_entrypoints.py | 170 ++++----- src/sctools/test/test_gtf.py | 48 +-- src/sctools/test/test_platform.py | 22 +- 10 files changed, 756 insertions(+), 146 deletions(-) diff --git a/src/sctools/consts.py b/src/sctools/consts.py index 8c2c27b..9a57e08 100644 --- a/src/sctools/consts.py +++ b/src/sctools/consts.py @@ -34,3 +34,8 @@ MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500 MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000 + + +# modes of the count matrix runs +SINGLE_CELL_COUNT_MATRIX = 0 +SINGLE_NUCLEI_COUNT_MATRIX = 1 diff --git a/src/sctools/count.py b/src/sctools/count.py index ab3babf..7df748b 100644 --- a/src/sctools/count.py +++ b/src/sctools/count.py @@ -135,7 +135,7 @@ def from_sorted_tagged_bam( cls, bam_file: str, gene_name_to_index: Dict[str, int], - gene_locations: Dict[str, List[tuple]] = None, + chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None, cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY, molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY, gene_name_tag: str = consts.GENE_NAME_TAG_KEY, @@ -172,7 +172,7 @@ def from_sorted_tagged_bam( bam_file : str input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that order - gene_locations : dict + chromosomes_gene_locations_extended : dict Location of genes by chromosome (default = None) cell_barcode_tag : str, optional @@ -251,7 +251,7 @@ def from_sorted_tagged_bam( ) in grouped_records_generator: # modify alignments to include the gene name to the alignments to INTRONIC regions - if gene_locations: + if chromosomes_gene_locations_extended: alignments = [] for alignment in input_alignments: if alignment.has_tag("XF"): @@ -259,14 +259,23 @@ def from_sorted_tagged_bam( if ( alignment.reference_name and aln_type == "INTRONIC" - and alignment.reference_name in gene_locations + and alignment.reference_name + in chromosomes_gene_locations_extended ): gene_name = cls.binary_overlap( - gene_locations[alignment.reference_name], + chromosomes_gene_locations_extended[ + alignment.reference_name + ], 0, - len(gene_locations[alignment.reference_name]) - 1, + len( + chromosomes_gene_locations_extended[ + alignment.reference_name + ] + ) + - 1, alignment.reference_start, ) + if gene_name: alignment.set_tag("GE", gene_name) alignments.append(alignment) diff --git a/src/sctools/gtf.py b/src/sctools/gtf.py index 3f17480..cf5ff57 100644 --- a/src/sctools/gtf.py +++ b/src/sctools/gtf.py @@ -350,3 +350,56 @@ def extract_extended_gene_names( # Sort by starting location gene_locations[chromosome].sort(key=lambda x: x[0]) return gene_locations + + +def extract_gene_exons( + files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#" +) -> Dict[str, List[tuple]]: + """Extract extended gene names from GTF file(s) and returns a map from gene names to the the + list of exons in the ascending order of the start positions file(s). + + Parameters + ---------- + files : Union[str, List], optional + File(s) to read. If '-', read sys.stdin (default = '-') + mode : {'r', 'rb'}, optional + Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r'). + header_comment_char : str, optional + lines beginning with this character are skipped (default = '#') + + Returns + ------- + Dict[str, List[tuple]] + A dictionary of chromosome names mapping to a List of tuples, each containing + a the exons in the ascending order of the start positions. + Dict[str, List(Tuple((start,end), gene))) + """ + gene_name_to_start_end = dict() + for record in Reader(files, mode, header_comment_char).filter( + retain_types=["exon"] + ): + gene_name = record.get_attribute("gene_name") + if gene_name is None: + raise ValueError( + f"Malformed GTF file detected. Record is of type gene but does not have a " + f'"gene_name" field: {record}' + ) + if record.chromosome not in gene_name_to_start_end: + gene_name_to_start_end[record.chromosome] = dict() + + if gene_name not in gene_name_to_start_end[record.chromosome]: + gene_name_to_start_end[record.chromosome][gene_name] = [] + + gene_name_to_start_end[record.chromosome][gene_name].append( + (record.start, record.end) + ) + + gene_locations_exons = dict() + # For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start + for chromosome in gene_name_to_start_end: + gene_locations_exons[chromosome] = [ + (locs, key) for key, locs in gene_name_to_start_end[chromosome].items() + ] + # Sort by starting location + gene_locations_exons[chromosome].sort(key=lambda x: x[0]) + return gene_locations_exons diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 877b1f7..2d746ad 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -447,7 +447,7 @@ def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int: matrix = count.CountMatrix.from_sorted_tagged_bam( bam_file=args.bam_file, gene_name_to_index=gene_name_to_index, - gene_locations=gene_locations, + chromosomes_gene_locations_extended=gene_locations, cell_barcode_tag=args.cell_barcode_tag, molecule_barcode_tag=args.molecule_barcode_tag, gene_name_tag=args.gene_name_tag, diff --git a/src/sctools/test/test_barcode.py b/src/sctools/test/test_barcode.py index fd022b5..f9fa39a 100644 --- a/src/sctools/test/test_barcode.py +++ b/src/sctools/test/test_barcode.py @@ -144,8 +144,7 @@ def tagged_bamfile(): outbam, ] platform.TenXV2.attach_barcodes(args) - yield outbam - os.remove(outbam) + return outbam def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x): diff --git a/src/sctools/test/test_count.py b/src/sctools/test/test_count.py index eb04d06..bfd3cbe 100644 --- a/src/sctools/test/test_count.py +++ b/src/sctools/test/test_count.py @@ -56,6 +56,7 @@ from typing import Callable, Optional, List, Set, Tuple, Dict, Generator import numpy as np +import scipy.sparse as sp import pysam import pytest @@ -75,6 +76,10 @@ _test_num_multiple_gene_alignments = 20 _test_max_gene_hits_per_multiple_gene_alignments = 5 +_test_num_only_exons = 10 +_test_num_only_introns = 10 +_test_both_exons_introns = 10 + @pytest.fixture(scope="module") def gene_name_to_index() -> Dict[str, int]: @@ -89,10 +94,12 @@ def __init__( cell_barcode: Optional[str], molecule_barcode: Optional[str], gene_name: Optional[str], + alignment_location: Optional[str] = "EXONIC", ) -> None: self.cell_barcode = cell_barcode self.molecule_barcode = molecule_barcode self.gene_name = gene_name + self.alignment_location = alignment_location def __hash__(self): return hash((self.cell_barcode, self.molecule_barcode, self.gene_name)) @@ -101,7 +108,8 @@ def __repr__(self): return ( f"{consts.CELL_BARCODE_TAG_KEY}: {self.cell_barcode}, " f"{consts.MOLECULE_BARCODE_TAG_KEY}: {self.molecule_barcode}, " - f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}" + f"{consts.GENE_NAME_TAG_KEY}: {self.gene_name}", + f"{consts.ALIGNMENT_LOCATION_TAG_KEY}: {self.alignment_location}", ) @@ -202,6 +210,7 @@ def __init__( k for k, v in sorted(gene_name_to_index.items(), key=operator.itemgetter(1)) ] self.num_genes = len(self.all_gene_names) + self.max_genes = max_genes assert ( max_genes <= self.num_genes @@ -439,6 +448,8 @@ def _generate_aligned_segment_from_tags( i_query: int, num_queries: int, rng: np.random.RandomState, + record_reference_id: Optional[int] = 0, + reference_start: Optional[int] = -1, ) -> pysam.AlignedSegment: """Generates pysam.AlignedSegment instances from alignment_tags. @@ -476,14 +487,31 @@ def _generate_aligned_segment_from_tags( ) if alignment_tags.gene_name: tags.append((consts.GENE_NAME_TAG_KEY, alignment_tags.gene_name, "Z")) + + if alignment_tags.alignment_location: + tags.append( + ( + consts.ALIGNMENT_LOCATION_TAG_KEY, + alignment_tags.alignment_location, + "Z", + ) + ) + record = pysam.AlignedSegment() record.query_name = SyntheticTaggedBAMGenerator._generate_query_name( query_prefix, i_query, num_queries ) - record.reference_start = rng.randint( - low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH + + if reference_start == -1: + record.reference_start = rng.randint( + low=0, high=SyntheticTaggedBAMGenerator.SYNTHETIC_SEQUENCE_LENGTH + ) + else: + record.reference_start = reference_start + + record.reference_id = ( + record_reference_id # note: we only use one synthetic sequence ) - record.reference_id = 0 # note: we only use one synthetic sequence if len(tags) > 0: record.set_tags(tags) return record @@ -813,3 +841,519 @@ def test_count_matrix_from_bam( ) ] ) + + +def extract_gene_non_exons( + chromosome_gene_exons: Dict[str, List[tuple]], + chromosome_gene_locations_extended: Dict[str, List[tuple]], +) -> Dict[str, Dict[str, List[tuple]]]: + + chromosome_gene_non_exons = {} + + for chromosome in chromosome_gene_exons: + chromosome_gene_non_exons[chromosome] = {} + gene_name_exon_list = {} + for gene_exons in chromosome_gene_exons[chromosome]: + gene_name_exon_list[gene_exons[1]] = gene_exons[0] + + gene_name_location_dict = {} + for gene_locations in chromosome_gene_locations_extended[chromosome]: + gene_name_location_dict[gene_locations[1]] = gene_locations[0] + + for gene_name in gene_name_location_dict: + non_exon_list = [] + if gene_name in gene_name_exon_list: + + start, end = gene_name_location_dict[gene_name] + coords = gene_name_exon_list[gene_name] + coords.sort(key=lambda a: a[0]) + + x = start + y = coords[0][0] - 1 + i = 0 + + n = len(coords) + while i < n: + if y <= coords[i][0]: + if x < y: + non_exon_list.append((x, y)) + x = coords[i][1] + else: + x = max(x, coords[i][1]) + + if i < n - 1: + y = min(end, coords[i + 1][0]) + i += 1 + chromosome_gene_non_exons[chromosome][gene_name] = non_exon_list.copy() + + return chromosome_gene_non_exons + + +@pytest.mark.parametrize( + "alignment_sort_order", + [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], + ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], +) +def test_count_matrix_with_introns( + alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index +): + _count_matrix_with_introns( + alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX + ) + _count_matrix_with_introns( + alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX + ) + + +def _count_matrix_with_introns( + alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index +): + + chromosomes_gene_locations_extended = gtf.extract_extended_gene_names( + _test_annotation_file + ) + chromosomes_gene_exons = gtf.extract_gene_exons(_test_annotation_file) + + _test_chromosomes_gene_non_exons = extract_gene_non_exons( + chromosomes_gene_exons, chromosomes_gene_locations_extended + ) + + _test_chromosomes_gene_exons = {} + for chromosome in chromosomes_gene_exons: + _test_chromosomes_gene_exons[chromosome] = {} + for gene_exons in chromosomes_gene_exons[chromosome]: + _test_chromosomes_gene_exons[chromosome][gene_exons[1]] = gene_exons[0] + + # instantiate a test data generator + chromosome = list(_test_chromosomes_gene_exons.keys())[0] + + synthetic_data_generator = SyntheticTaggedAlignmentTypeBAMGenerator( + _test_num_cells, + _test_max_genes, + _test_chromosomes_gene_exons[chromosome], + _test_chromosomes_gene_non_exons[chromosome], + ) + + _test_temp_dir = tempfile.TemporaryDirectory() + try: + # generate test data + synthetic_data_generator.generate_synthetic_bam_and_counts_matrix( + _test_temp_dir.name, + gene_name_to_index, + test_index, + alignment_sort_order=alignment_sort_order, + ) + + # test data paths + test_bam_path = os.path.join( + _test_temp_dir.name, + SyntheticTaggedAlignmentTypeBAMGenerator.bam_output_filename, + ) + test_count_matrix_path = os.path.join( + _test_temp_dir.name, + SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, + ) + test_row_index_path = os.path.join( + _test_temp_dir.name, + SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, + ) + test_col_index_path = os.path.join( + _test_temp_dir.name, + SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, + ) + # create CountMatrix from the synthetic bam + if test_index == consts.SINGLE_CELL_COUNT_MATRIX: + count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( + test_bam_path, gene_name_to_index + ) + if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX: + count_matrix_from_bam: CountMatrix = CountMatrix.from_sorted_tagged_bam( + test_bam_path, + gene_name_to_index, + chromosomes_gene_locations_extended=chromosomes_gene_locations_extended, + ) + + # load the test counts matrix + _count_matrix_data_expected = sp.csr_matrix(np.load(test_count_matrix_path)) + row_index_expected = np.load(test_row_index_path) + col_index_expected = np.load(test_col_index_path) + + count_matrix_data_expected = CountMatrix( + _count_matrix_data_expected, row_index_expected, col_index_expected + ) + count_matrix_data_expected = count_matrix_data_expected.matrix.todense() + + finally: + _test_temp_dir.cleanup() + + count_matrix_data_from_bam = count_matrix_from_bam.matrix.todense() + row_index_from_bam = count_matrix_from_bam.row_index + col_index_from_bam = count_matrix_from_bam.col_index + + # sort expected and from_bam results by their respective row and column indices, since their sort order + # is not part of the design specs and is considered arbitrary + ( + sorted_count_matrix_data_from_bam, + sorted_row_index_from_bam, + sorted_col_index_from_bam, + ) = _get_sorted_count_matrix( + count_matrix_data_from_bam, row_index_from_bam, col_index_from_bam + ) + ( + sorted_count_matrix_data_expected, + sorted_row_index_expected, + sorted_col_index_expected, + ) = _get_sorted_count_matrix( + count_matrix_data_expected, row_index_expected, col_index_expected + ) + + assert all( + [ + row_name_from_bam == row_name_expected + for row_name_from_bam, row_name_expected in zip( + sorted_row_index_from_bam, sorted_row_index_expected + ) + ] + ) + assert all( + [ + col_name_from_bam == col_name_expected + for col_name_from_bam, col_name_expected in zip( + sorted_col_index_from_bam, sorted_col_index_expected + ) + ] + ) + + assert np.allclose( + sorted_count_matrix_data_from_bam, sorted_count_matrix_data_expected + ) + + +class SyntheticTaggedAlignmentTypeBAMGenerator: + """This class generates a synthetic count matrix and an accompanying synthetic tagged BAM file as + described in the preamble documentation block. + + Parameters + ---------- + num_cells : int + number of real cells + max-genes : int + maximum number of genes to use to generate synthetic counts + chromosomes_gene_exons : Dict[str, Dict[str, List[tuple]]] + keys at the first level refers to chromosome number, keys at the + second level refers to a gene and with the list of exonic regions as values + chromosomes_gene_non_exons : Dict[str, Dict[str, List[tuple]]] + keys at the first level refers to chromosome number, keys at the + second level refers to a gene and with the list of intronic regions as values + + rng_seed : int + random number generator seed + + Methods + ------- + generate_synthetic_bam_and_counts_matrix + generates synthetic test data and writes the output to disk + + See Also + -------- + count.from_sorted_tagged_bam + """ + + OUTPUT_PREFIX = "intronic_" + SYNTHETIC_SEQUENCE_LENGTH = 5 + REFERENCE_SEQUENCE_NAME = "1" + # EXONIC_SEQUENCE_NAME = "EXONIC_SEQUENCE" + SYNTHETIC_SEQUENCE_LENGTH = 100 + + bam_output_filename = OUTPUT_PREFIX + "records.bam" + count_matrix_output_filename = OUTPUT_PREFIX + "count_matrix.npy" + row_index_output_filename = OUTPUT_PREFIX + "_row_index.npy" + col_index_output_filename = OUTPUT_PREFIX + "_col_index.npy" + + def __init__( + self, + num_cells: int, + max_genes: int, + chromosomes_gene_exons: Dict[str, Dict[str, List[tuple]]], + chromosomes_gene_non_exons: Dict[str, List[tuple]], + rng_seed: int = 777, + ) -> None: + self.num_cells = num_cells + + self.chromosomes_gene_exons = chromosomes_gene_exons + self.chromosomes_gene_non_exons = chromosomes_gene_non_exons + + # initialize the random number generator + self.rng: np.random.RandomState = np.random.RandomState(seed=rng_seed) + + # generate gene names + self.all_gene_names = list(self.chromosomes_gene_exons.keys())[:max_genes] + self.num_genes = len(self.all_gene_names) + + self.max_genes = max_genes + assert ( + max_genes <= self.num_genes + ), f"Max genes ({self.max_genes}) must be <= to all annotated genes ({self.num_genes})" + self.to_be_used_gene_indices: List[int] = self.rng.choice( + np.arange(0, self.num_genes, dtype=np.int), + size=self.max_genes, + replace=False, + ).tolist() + self.to_be_used_gene_names = [ + self.all_gene_names[j] for j in self.to_be_used_gene_indices + ] + + def _generate_random_cell_barcode(self, length: int = 16): + return self._generate_random_genomic_sequences(length) + + def _generate_random_molecule_barcode(self, length: int = 10): + return self._generate_random_genomic_sequences(length) + + def _generate_random_genomic_sequences(self, length: int): + return "".join(self.rng.choice(["A", "C", "T", "G"], size=length)) + + def _generate_location_based_tag_list( + self, num_alignments: int, gene_names: List[str], alignment_location: str + ): + alignment_record_tags = [] + for i in range(num_alignments): + alignment_record_tags.append( + AlignmentRecordTags( + self._generate_random_cell_barcode(), + self._generate_random_molecule_barcode(), + gene_names[i], + alignment_location, + ) + ) + + return alignment_record_tags + + def _add_alignment_start_coordinates(self, alignment_tags, alignment_location): + _alignment_tags = [] + + for alignment_tag in alignment_tags: + if alignment_location == "EXONIC": + if alignment_tag.gene_name in self.chromosomes_gene_exons: + coord = self.chromosomes_gene_exons[alignment_tag.gene_name] + setattr(alignment_tag, "coordinate", coord[0][0] + 1) + _alignment_tags.append(alignment_tag) + + if alignment_location == "INTRONIC": + if alignment_tag.gene_name in self.chromosomes_gene_non_exons: + coord = self.chromosomes_gene_non_exons[alignment_tag.gene_name] + if coord: + setattr(alignment_tag, "coordinate", coord[0][0] + 1) + alignment_tag.gene_name = "" + _alignment_tags.append(alignment_tag) + + return _alignment_tags + + def generate_synthetic_bam_and_counts_matrix( + self, + output_path: str, + gene_name_to_index: int, + test_index: int, + alignment_sort_order: bam.AlignmentSortOrder = CellMoleculeGeneQueryNameSortOrder(), + ): + """Generates synthetic count matrix and BAM file and writes them to disk. + + Parameters + ---------- + output_path : str + output path + gene_name_to_index : Dict[str, int] + gene name to an index + test_index : int + 0 for single cell matrix and 1 for single nuclei matrix + alignment_sort_order : bam.AlignmentSortOrder + sort order of BAM alignment records; if 'None', random sort order is implied + + Returns + ------- + None + """ + + gene_names_alignments = [] + + for gene_name in sorted(self.chromosomes_gene_non_exons.keys()): + if self.chromosomes_gene_non_exons[gene_name]: + gene_names_alignments.append(gene_name) + + gene_names: List[int] = [] + cell_ids: List[int] = [] + + records = [] + # Only exons, expected in both single-cell and single-nuclei modes + exonic_alignment_tags = self._generate_location_based_tag_list( + 10, gene_names_alignments[0:], "EXONIC" + ) + exonic_alignment_tags = self._add_alignment_start_coordinates( + exonic_alignment_tags, "EXONIC" + ) + + for i, alignment_tag in enumerate(exonic_alignment_tags): + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + alignment_tag, + "EXONIC", + i, + 10, + self.rng, + reference_start=alignment_tag.coordinate, + ) + records.append(pysam_alignment) + gene_names.append(alignment_tag.gene_name) + cell_ids.append(alignment_tag.cell_barcode) + + "Only introns only in single-nuclei mode" + intronic_alignment_tags = self._generate_location_based_tag_list( + 3, gene_names_alignments[10:], "INTRONIC" + ) + intronic_alignment_tags = self._add_alignment_start_coordinates( + intronic_alignment_tags, "INTRONIC" + ) + for i, alignment_tag in enumerate(intronic_alignment_tags): + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + alignment_tag, + "INTRONIC", + i + 10, + 10, + self.rng, + reference_start=alignment_tag.coordinate, + ) + records.append(pysam_alignment) + if test_index == consts.SINGLE_NUCLEI_COUNT_MATRIX: + gene_names.append(gene_names_alignments[i + 10]) + cell_ids.append(alignment_tag.cell_barcode) + + "both intron and exons from the same gene in bost single-cell and single-nuclei modes" + exonic_alignment_tags = self._generate_location_based_tag_list( + 10, gene_names_alignments[20:], "EXONIC" + ) + exonic_alignment_tags = self._add_alignment_start_coordinates( + exonic_alignment_tags, "EXONIC" + ) + + _intronic_alignment_tags = self._generate_location_based_tag_list( + 10, gene_names_alignments[20:], "INTRONIC" + ) + intronic_alignment_tags = [] + for intronic_tag, exonic_tag in zip( + _intronic_alignment_tags, exonic_alignment_tags + ): + intronic_tag.cell_barcode = exonic_tag.cell_barcode + intronic_alignment_tags.append(intronic_tag) + intronic_alignment_tags = self._add_alignment_start_coordinates( + intronic_alignment_tags, "INTRONIC" + ) + + for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( + zip(exonic_alignment_tags, intronic_alignment_tags) + ): + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + exonic_alignment_tag, + "EXONINTRONSAME", + i + 20, + 10, + self.rng, + reference_start=exonic_alignment_tag.coordinate, + ) + records.append(pysam_alignment) + + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + intronic_alignment_tag, + "EXONINTRONSAME", + i + 20, + 10, + self.rng, + reference_start=intronic_alignment_tag.coordinate, + ) + records.append(pysam_alignment) + cell_ids.append(exonic_alignment_tag.cell_barcode) + gene_names.append(exonic_alignment_tag.gene_name) + + # both intron and exons from separate genes should not appear in single-cell mode + exonic_alignment_tags = self._generate_location_based_tag_list( + 10, gene_names_alignments[30:], "EXONIC" + ) + exonic_alignment_tags = self._add_alignment_start_coordinates( + exonic_alignment_tags, "EXONIC" + ) + + _intronic_alignment_tags = self._generate_location_based_tag_list( + 10, gene_names_alignments[31:], "INTRONIC" + ) + intronic_alignment_tags = [] + for intronic_tag, exonic_tag in zip( + _intronic_alignment_tags, exonic_alignment_tags + ): + intronic_tag.cell_barcode = exonic_tag.cell_barcode + intronic_alignment_tags.append(intronic_tag) + intronic_alignment_tags = self._add_alignment_start_coordinates( + intronic_alignment_tags, "INTRONIC" + ) + + for i, (exonic_alignment_tag, intronic_alignment_tag) in enumerate( + zip(exonic_alignment_tags, intronic_alignment_tags) + ): + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + exonic_alignment_tag, + "EXONINTRONSEP", + i + 30, + 10, + self.rng, + reference_start=exonic_alignment_tag.coordinate, + ) + records.append(pysam_alignment) + + pysam_alignment = SyntheticTaggedBAMGenerator._generate_aligned_segment_from_tags( + intronic_alignment_tag, + "EXONINTRONSEP", + i + 30, + 10, + self.rng, + reference_start=intronic_alignment_tag.coordinate, + ) + records.append(pysam_alignment) + + if test_index == consts.SINGLE_CELL_COUNT_MATRIX: + cell_ids.append(exonic_alignment_tag.cell_barcode) + gene_names.append(exonic_alignment_tag.gene_name) + + # write BAM file + with pysam.AlignmentFile( + os.path.join(output_path, self.bam_output_filename), + mode="wb", + reference_names=[self.REFERENCE_SEQUENCE_NAME], + reference_lengths=[self.SYNTHETIC_SEQUENCE_LENGTH], + ) as bo: + for record in records: + bo.write(record) + + n_genes = len(gene_name_to_index) + n_data = len(cell_ids) + # write count matrix, row index, and col index + count_matrix = np.zeros((n_data, n_genes), dtype=np.int32) + for i, (cell_id, gene_name) in enumerate(zip(cell_ids, gene_names)): + count_matrix[i][gene_name_to_index[gene_name]] = 1 + + test_count_matrix_path = os.path.join( + output_path, + SyntheticTaggedAlignmentTypeBAMGenerator.count_matrix_output_filename, + ) + test_row_index_path = os.path.join( + output_path, + SyntheticTaggedAlignmentTypeBAMGenerator.row_index_output_filename, + ) + test_col_index_path = os.path.join( + output_path, + SyntheticTaggedAlignmentTypeBAMGenerator.col_index_output_filename, + ) + + np.save(test_count_matrix_path, count_matrix) + np.save(test_row_index_path, cell_ids) + gene_rank = [(gene, rank) for gene, rank in gene_name_to_index.items()] + gene_rank.sort(key=lambda x: x[1]) + gene_names = [x[0] for x in gene_rank] + np.save(test_col_index_path, gene_names) + + return os.path.join(output_path, self.bam_output_filename) diff --git a/src/sctools/test/test_encodings.py b/src/sctools/test/test_encodings.py index d1d3dd9..1eeb458 100644 --- a/src/sctools/test/test_encodings.py +++ b/src/sctools/test/test_encodings.py @@ -3,23 +3,23 @@ from itertools import combinations -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def sequence(): - return b'ACGTTTGAGATGAGATATAGANNNN' + return b"ACGTTTGAGATGAGATATAGANNNN" -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def encoder_2bit(sequence): length = len(sequence) return encodings.TwoBit(length) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def encoder_3bit(): return encodings.ThreeBit() -@pytest.fixture(scope='module', params=[encodings.TwoBit, encodings.ThreeBit]) +@pytest.fixture(scope="module", params=[encodings.TwoBit, encodings.ThreeBit]) def encoder(request): return request.param @@ -39,34 +39,34 @@ def test_three_bit_encode_decode_produces_same_string(sequence, encoder_3bit): def test_two_bit_encoder_gets_correct_gc_content(encoder_2bit): - sequence_no_n = b'AGCGCGAT' - gc_content = sequence_no_n.count(b'C') + sequence_no_n.count(b'G') + sequence_no_n = b"AGCGCGAT" + gc_content = sequence_no_n.count(b"C") + sequence_no_n.count(b"G") encoded = encoder_2bit.encode(sequence_no_n) assert encoder_2bit.gc_content(encoded) == gc_content def test_three_bit_encoder_gets_correct_gc_content(sequence, encoder_3bit): encoded = encoder_3bit.encode(sequence) - assert encoder_3bit.gc_content(encoded) == sequence.count(b'C') + sequence.count( - b'G' + assert encoder_3bit.gc_content(encoded) == sequence.count(b"C") + sequence.count( + b"G" ) def test_two_bit_throws_errors_when_asked_to_encode_unknown_nucleotide(encoder_2bit): with pytest.raises(KeyError): - encoder_2bit.encode(b'ACGTP') # P is not a valid code + encoder_2bit.encode(b"ACGTP") # P is not a valid code def test_three_bit_encodes_unknown_nucleotides_as_N(encoder_3bit): - encoded = encoder_3bit.encode(b'ACGTP') # P is not a valid code + encoded = encoder_3bit.encode(b"ACGTP") # P is not a valid code decoded = encoder_3bit.decode(encoded) - assert decoded == b'ACGTN' + assert decoded == b"ACGTN" @pytest.fixture def simple_barcodes(): """simple barcode set with min_hamming = 1, max_hamming = 2""" - return [b'ACGT', b'ACGG', b'ACGA', b'ACGC', b'TCGT', b'CCGT', b'GCGT'] + return [b"ACGT", b"ACGG", b"ACGA", b"ACGC", b"TCGT", b"CCGT", b"GCGT"] @pytest.fixture diff --git a/src/sctools/test/test_entrypoints.py b/src/sctools/test/test_entrypoints.py index 36143c3..419b325 100644 --- a/src/sctools/test/test_entrypoints.py +++ b/src/sctools/test/test_entrypoints.py @@ -9,24 +9,24 @@ from sctools import bam, platform, count, consts -data_dir = os.path.split(__file__)[0] + '/data/' +data_dir = os.path.split(__file__)[0] + "/data/" def test_Attach10XBarcodes_entrypoint(): args = [ - '--r1', - data_dir + 'test_r1.fastq', - '--i1', - data_dir + 'test_i7.fastq', - '--u2', - data_dir + 'test.bam', - '--output-bamfile', - 'test_tagged_bam.bam', + "--r1", + data_dir + "test_r1.fastq", + "--i1", + data_dir + "test_i7.fastq", + "--u2", + data_dir + "test.bam", + "--output-bamfile", + "test_tagged_bam.bam", ] rc = platform.TenXV2.attach_barcodes(args) assert rc == 0 - with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f: + with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: for alignment in f: # each alignment should now have a tag, and that tag should be a string assert isinstance( @@ -43,27 +43,27 @@ def test_Attach10XBarcodes_entrypoint(): assert isinstance( alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str ) - os.remove('test_tagged_bam.bam') # clean up + os.remove("test_tagged_bam.bam") # clean up def test_Attach10XBarcodes_entrypoint_with_whitelist(): args = [ - '--r1', - data_dir + 'test_r1.fastq', - '--i1', - data_dir + 'test_i7.fastq', - '--u2', - data_dir + 'test.bam', - '--output-bamfile', - 'test_tagged_bam.bam', - '--whitelist', - data_dir + '1k-august-2016.txt', + "--r1", + data_dir + "test_r1.fastq", + "--i1", + data_dir + "test_i7.fastq", + "--u2", + data_dir + "test.bam", + "--output-bamfile", + "test_tagged_bam.bam", + "--whitelist", + data_dir + "1k-august-2016.txt", ] return_call = platform.TenXV2.attach_barcodes(args) assert return_call == 0 success = False - with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f: + with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: for alignment in f: if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): success = True @@ -83,23 +83,23 @@ def test_Attach10XBarcodes_entrypoint_with_whitelist(): alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str ) assert success - os.remove('test_tagged_bam.bam') # clean up + os.remove("test_tagged_bam.bam") # clean up def test_AttachBarcodes_entrypoint_with_whitelist(): # test of the BarcodePlatform.attach_barcodes entry point with # sample, cell, and molecule barcodes all specified args = [ - '--r1', - data_dir + 'test_r1.fastq', - '--i1', - data_dir + 'test_i7.fastq', - '--u2', - data_dir + 'test.bam', - '--output-bamfile', - 'test_tagged_bam.bam', - '--whitelist', - data_dir + '1k-august-2016.txt', + "--r1", + data_dir + "test_r1.fastq", + "--i1", + data_dir + "test_i7.fastq", + "--u2", + data_dir + "test.bam", + "--output-bamfile", + "test_tagged_bam.bam", + "--whitelist", + data_dir + "1k-august-2016.txt", "--sample-barcode-start-position", "0", "--sample-barcode-length", @@ -117,7 +117,7 @@ def test_AttachBarcodes_entrypoint_with_whitelist(): return_call = platform.BarcodePlatform.attach_barcodes(args) assert return_call == 0 success = False - with pysam.AlignmentFile('test_tagged_bam.bam', 'rb', check_sq=False) as f: + with pysam.AlignmentFile("test_tagged_bam.bam", "rb", check_sq=False) as f: for alignment in f: if alignment.has_tag(consts.CELL_BARCODE_TAG_KEY): success = True @@ -138,33 +138,33 @@ def test_AttachBarcodes_entrypoint_with_whitelist(): alignment.get_tag(consts.QUALITY_SAMPLE_BARCODE_TAG_KEY), str ) assert success - os.remove('test_tagged_bam.bam') # clean up + os.remove("test_tagged_bam.bam") # clean up def test_split_bam(): tag_args = [ - '--r1', - data_dir + 'test_r1.fastq', - '--i1', - data_dir + 'test_i7.fastq', - '--u2', - data_dir + 'test.bam', - '--output-bamfile', - 'test_tagged_bam.bam', - '--whitelist', - data_dir + '1k-august-2016.txt', + "--r1", + data_dir + "test_r1.fastq", + "--i1", + data_dir + "test_i7.fastq", + "--u2", + data_dir + "test.bam", + "--output-bamfile", + "test_tagged_bam.bam", + "--whitelist", + data_dir + "1k-august-2016.txt", ] platform.TenXV2.attach_barcodes(tag_args) split_args = [ - '--bamfile', - 'test_tagged_bam.bam', - '--output-prefix', - 'test_tagged', - '--subfile-size', - '0.005', - '--tags', + "--bamfile", + "test_tagged_bam.bam", + "--output-prefix", + "test_tagged", + "--subfile-size", + "0.005", + "--tags", consts.CELL_BARCODE_TAG_KEY, consts.RAW_CELL_BARCODE_TAG_KEY, ] @@ -172,17 +172,17 @@ def test_split_bam(): return_call = platform.GenericPlatform.split_bam(split_args) assert return_call == 0 - for f in glob.glob('test_tagged*'): + for f in glob.glob("test_tagged*"): os.remove(f) def test_tag_sort_bam(): args = [ - '-i', - data_dir + 'unsorted.bam', - '-o', - 'test_sorted.bam', - '-t', + "-i", + data_dir + "unsorted.bam", + "-o", + "test_sorted.bam", + "-t", consts.CELL_BARCODE_TAG_KEY, consts.GENE_NAME_TAG_KEY, consts.MOLECULE_BARCODE_TAG_KEY, @@ -196,28 +196,28 @@ def test_tag_sort_bam(): consts.GENE_NAME_TAG_KEY, consts.MOLECULE_BARCODE_TAG_KEY, ] - with pysam.AlignmentFile('test_sorted.bam', 'rb') as f: + with pysam.AlignmentFile("test_sorted.bam", "rb") as f: segments = f.fetch(until_eof=True) tag_sortable_records = ( bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments ) bam.verify_sort(tag_sortable_records, tag_keys) - for f in glob.glob('test_sorted*'): + for f in glob.glob("test_sorted*"): os.remove(f) def test_tag_sort_bam_dash_t_specified_multiple_times(): args = [ - '-i', - data_dir + 'unsorted.bam', - '-o', - 'test_sorted.bam', - '-t', + "-i", + data_dir + "unsorted.bam", + "-o", + "test_sorted.bam", + "-t", consts.CELL_BARCODE_TAG_KEY, - '-t', + "-t", consts.GENE_NAME_TAG_KEY, - '-t', + "-t", consts.MOLECULE_BARCODE_TAG_KEY, ] @@ -229,40 +229,40 @@ def test_tag_sort_bam_dash_t_specified_multiple_times(): consts.GENE_NAME_TAG_KEY, consts.MOLECULE_BARCODE_TAG_KEY, ] - with pysam.AlignmentFile('test_sorted.bam', 'rb') as f: + with pysam.AlignmentFile("test_sorted.bam", "rb") as f: segments = f.fetch(until_eof=True) tag_sortable_record_generator = ( bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments ) bam.verify_sort(tag_sortable_record_generator, tag_keys) - for f in glob.glob('test_sorted*'): + for f in glob.glob("test_sorted*"): os.remove(f) def test_tag_sort_bam_no_tags(): - args = ['-i', data_dir + 'unsorted.bam', '-o', 'test_sorted.bam'] + args = ["-i", data_dir + "unsorted.bam", "-o", "test_sorted.bam"] return_call = platform.GenericPlatform.tag_sort_bam(args) assert return_call == 0 tag_keys = [] - with pysam.AlignmentFile('test_sorted.bam', 'rb') as f: + with pysam.AlignmentFile("test_sorted.bam", "rb") as f: segments = f.fetch(until_eof=True) tag_sortable_records = ( bam.TagSortableRecord.from_aligned_segment(s, tag_keys) for s in segments ) bam.verify_sort(tag_sortable_records, tag_keys) - for f in glob.glob('test_sorted*'): + for f in glob.glob("test_sorted*"): os.remove(f) def test_verify_bam_sort(): args = [ - '-i', - data_dir + 'cell-gene-umi-queryname-sorted.bam', - '-t', + "-i", + data_dir + "cell-gene-umi-queryname-sorted.bam", + "-t", consts.CELL_BARCODE_TAG_KEY, consts.GENE_NAME_TAG_KEY, consts.MOLECULE_BARCODE_TAG_KEY, @@ -274,9 +274,9 @@ def test_verify_bam_sort(): def test_verify_bam_sort_raises_error_on_unsorted(): args = [ - '-i', - data_dir + 'unsorted.bam', - '-t', + "-i", + data_dir + "unsorted.bam", + "-t", consts.CELL_BARCODE_TAG_KEY, consts.GENE_NAME_TAG_KEY, consts.MOLECULE_BARCODE_TAG_KEY, @@ -293,15 +293,15 @@ def test_count_merge(): matrix = sp.coo_matrix((data, (ind, col)), shape=(10, 10), dtype=np.float32).tocsr() # be lazy and reuse the inds as the col and row index counts = count.CountMatrix(matrix, ind, col) - counts.save(tmp + '/test_input_1') - counts.save(tmp + '/test_input_2') + counts.save(tmp + "/test_input_1") + counts.save(tmp + "/test_input_2") merge_args = [ - '-o', - tmp + '/test_merged_counts', - '-i', - tmp + '/test_input_2', - tmp + '/test_input_1', + "-o", + tmp + "/test_merged_counts", + "-i", + tmp + "/test_input_2", + tmp + "/test_input_1", ] return_call = platform.GenericPlatform.merge_count_matrices(merge_args) assert return_call == 0 diff --git a/src/sctools/test/test_gtf.py b/src/sctools/test/test_gtf.py index 49661bd..fd74ea9 100644 --- a/src/sctools/test/test_gtf.py +++ b/src/sctools/test/test_gtf.py @@ -3,42 +3,42 @@ from itertools import chain import pytest -_data_dir = os.path.split(__file__)[0] + '/data' -_files = ['%s/%s' % (_data_dir, f) for f in ('test.gtf', 'test.gtf.gz', 'test.gtf.bz2')] +_data_dir = os.path.split(__file__)[0] + "/data" +_files = ["%s/%s" % (_data_dir, f) for f in ("test.gtf", "test.gtf.gz", "test.gtf.bz2")] -@pytest.fixture(scope='module', params=_files) +@pytest.fixture(scope="module", params=_files) def files(request): """returns a filename""" return request.param def test_opens_file_reads_first_line(files): - rd = gtf.Reader(files, 'r', header_comment_char='#') + rd = gtf.Reader(files, "r", header_comment_char="#") record = next(iter(rd)) assert isinstance(record, gtf.GTFRecord) def test_opens_file_populates_fields_properly(files): - rd = gtf.Reader(files, 'r', header_comment_char='#') + rd = gtf.Reader(files, "r", header_comment_char="#") record = next(iter(rd)) - assert record.seqname == 'chr19' - assert record.chromosome == 'chr19' - assert record.source == 'HAVANA' - assert record.feature == 'gene' + assert record.seqname == "chr19" + assert record.chromosome == "chr19" + assert record.source == "HAVANA" + assert record.feature == "gene" assert record.start == 60951 assert record.end == 71626 - assert record.score == '.' - assert record.strand == '-' - assert record.frame == '.' + assert record.score == "." + assert record.strand == "-" + assert record.frame == "." expected_features = { - 'gene_id': 'ENSG00000282458.1', - 'gene_type': 'transcribed_processed_pseudogene', - 'gene_status': 'KNOWN', - 'gene_name': 'WASH5P', - 'level': '2', - 'havana_gene': 'OTTHUMG00000180466.8', + "gene_id": "ENSG00000282458.1", + "gene_type": "transcribed_processed_pseudogene", + "gene_status": "KNOWN", + "gene_name": "WASH5P", + "level": "2", + "havana_gene": "OTTHUMG00000180466.8", } assert record._attributes == expected_features @@ -49,21 +49,21 @@ def test_opens_file_populates_fields_properly(files): def test_set_attribute_verify_included_in_output_string(files): - rd = gtf.Reader(files, 'r', header_comment_char='#') + rd = gtf.Reader(files, "r", header_comment_char="#") record = next(iter(rd)) - record.set_attribute('test_attr', 'foo') - assert record.get_attribute('test_attr') == 'foo' + record.set_attribute("test_attr", "foo") + assert record.get_attribute("test_attr") == "foo" # verify in output string - assert 'foo' in str(record) + assert "foo" in str(record) def test_opens_file_parses_size(files): - rd = gtf.Reader(files, 'r', header_comment_char='#') + rd = gtf.Reader(files, "r", header_comment_char="#") record = next(iter(rd)) assert 71626 - 60951 == record.size # mangle record, make sure error is raised record._fields[3:5] = [record.end, record.start] with pytest.raises(ValueError): - getattr(record, 'size') + getattr(record, "size") diff --git a/src/sctools/test/test_platform.py b/src/sctools/test/test_platform.py index c80c1f7..e18e0cd 100644 --- a/src/sctools/test/test_platform.py +++ b/src/sctools/test/test_platform.py @@ -4,7 +4,7 @@ from .. import platform -data_dir = os.path.split(__file__)[0] + '/data/' +data_dir = os.path.split(__file__)[0] + "/data/" def test_attach_barcodes(): @@ -13,15 +13,15 @@ def test_attach_barcodes(): temp_dir_name = tempfile.mkdtemp() # Construct cli arguments to pass to the command - temp_output_bam = temp_dir_name + 'output.bam' + temp_output_bam = temp_dir_name + "output.bam" args = [ "--r1", - data_dir + 'test_r1.fastq', + data_dir + "test_r1.fastq", "--u2", - data_dir + 'test_r2.bam', + data_dir + "test_r2.bam", "--i1", - data_dir + 'test_i1.fastq', + data_dir + "test_i1.fastq", "--o", temp_output_bam, "--sample-barcode-start-pos", @@ -42,12 +42,12 @@ def test_attach_barcodes(): with pysam.AlignmentFile(temp_output_bam, "rb", check_sq=False) as samfile: for read in samfile: - tag_cr = read.get_tag('CR') - tag_cy = read.get_tag('CY') - tag_ur = read.get_tag('UR') - tag_uy = read.get_tag('UY') - tag_sr = read.get_tag('SR') - tag_sy = read.get_tag('SY') + tag_cr = read.get_tag("CR") + tag_cy = read.get_tag("CY") + tag_ur = read.get_tag("UR") + tag_uy = read.get_tag("UY") + tag_sr = read.get_tag("SR") + tag_sy = read.get_tag("SY") assert len(tag_cr) == 16 assert len(tag_cy) == 16 assert len(tag_ur) == 4