Skip to content

Commit

Permalink
Added a unit test that reproduces the expected count matrix for snRNA…
Browse files Browse the repository at this point in the history
…-Seq mode in CreateCountMatrix (#71)

* intial commit

* snRNA-Seq unit test in pytest working

* fixed the pytest for snRNA functionallity

* corrected the chromosome-gene-exon from gene_locations

* Update count.py

* addressed the review comments and formatted code

* Update count.py

Co-authored-by: Nikolas Barkas <nikolas.barkas@outlook.com>
  • Loading branch information
kishorikonwar and barkasn committed Feb 26, 2020
1 parent a4bd5e6 commit a3ec39d
Show file tree
Hide file tree
Showing 10 changed files with 756 additions and 146 deletions.
5 changes: 5 additions & 0 deletions src/sctools/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,8 @@

MAX_BAM_SPLIT_SUBFILES_TO_WARN = 500
MAX_BAM_SPLIT_SUBFILES_TO_RAISE = 1000


# modes of the count matrix runs
SINGLE_CELL_COUNT_MATRIX = 0
SINGLE_NUCLEI_COUNT_MATRIX = 1
21 changes: 15 additions & 6 deletions src/sctools/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def from_sorted_tagged_bam(
cls,
bam_file: str,
gene_name_to_index: Dict[str, int],
gene_locations: Dict[str, List[tuple]] = None,
chromosomes_gene_locations_extended: Dict[str, List[tuple]] = None,
cell_barcode_tag: str = consts.CELL_BARCODE_TAG_KEY,
molecule_barcode_tag: str = consts.MOLECULE_BARCODE_TAG_KEY,
gene_name_tag: str = consts.GENE_NAME_TAG_KEY,
Expand Down Expand Up @@ -172,7 +172,7 @@ def from_sorted_tagged_bam(
bam_file : str
input bam file marked by cell barcode, molecule barcode, and gene ID tags sorted in that
order
gene_locations : dict
chromosomes_gene_locations_extended : dict
Location of genes by chromosome
(default = None)
cell_barcode_tag : str, optional
Expand Down Expand Up @@ -251,22 +251,31 @@ def from_sorted_tagged_bam(
) in grouped_records_generator:

# modify alignments to include the gene name to the alignments to INTRONIC regions
if gene_locations:
if chromosomes_gene_locations_extended:
alignments = []
for alignment in input_alignments:
if alignment.has_tag("XF"):
aln_type = alignment.get_tag("XF")
if (
alignment.reference_name
and aln_type == "INTRONIC"
and alignment.reference_name in gene_locations
and alignment.reference_name
in chromosomes_gene_locations_extended
):
gene_name = cls.binary_overlap(
gene_locations[alignment.reference_name],
chromosomes_gene_locations_extended[
alignment.reference_name
],
0,
len(gene_locations[alignment.reference_name]) - 1,
len(
chromosomes_gene_locations_extended[
alignment.reference_name
]
)
- 1,
alignment.reference_start,
)

if gene_name:
alignment.set_tag("GE", gene_name)
alignments.append(alignment)
Expand Down
53 changes: 53 additions & 0 deletions src/sctools/gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,56 @@ def extract_extended_gene_names(
# Sort by starting location
gene_locations[chromosome].sort(key=lambda x: x[0])
return gene_locations


def extract_gene_exons(
files: Union[str, List[str]] = "-", mode: str = "r", header_comment_char: str = "#"
) -> Dict[str, List[tuple]]:
"""Extract extended gene names from GTF file(s) and returns a map from gene names to the the
list of exons in the ascending order of the start positions file(s).
Parameters
----------
files : Union[str, List], optional
File(s) to read. If '-', read sys.stdin (default = '-')
mode : {'r', 'rb'}, optional
Open mode. If 'r', read strings. If 'rb', read bytes (default = 'r').
header_comment_char : str, optional
lines beginning with this character are skipped (default = '#')
Returns
-------
Dict[str, List[tuple]]
A dictionary of chromosome names mapping to a List of tuples, each containing
a the exons in the ascending order of the start positions.
Dict[str, List(Tuple((start,end), gene)))
"""
gene_name_to_start_end = dict()
for record in Reader(files, mode, header_comment_char).filter(
retain_types=["exon"]
):
gene_name = record.get_attribute("gene_name")
if gene_name is None:
raise ValueError(
f"Malformed GTF file detected. Record is of type gene but does not have a "
f'"gene_name" field: {record}'
)
if record.chromosome not in gene_name_to_start_end:
gene_name_to_start_end[record.chromosome] = dict()

if gene_name not in gene_name_to_start_end[record.chromosome]:
gene_name_to_start_end[record.chromosome][gene_name] = []

gene_name_to_start_end[record.chromosome][gene_name].append(
(record.start, record.end)
)

gene_locations_exons = dict()
# For each chromosome invert the map to be in List[( (start,end), genename )] and sort it by start
for chromosome in gene_name_to_start_end:
gene_locations_exons[chromosome] = [
(locs, key) for key, locs in gene_name_to_start_end[chromosome].items()
]
# Sort by starting location
gene_locations_exons[chromosome].sort(key=lambda x: x[0])
return gene_locations_exons
2 changes: 1 addition & 1 deletion src/sctools/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int:
matrix = count.CountMatrix.from_sorted_tagged_bam(
bam_file=args.bam_file,
gene_name_to_index=gene_name_to_index,
gene_locations=gene_locations,
chromosomes_gene_locations_extended=gene_locations,
cell_barcode_tag=args.cell_barcode_tag,
molecule_barcode_tag=args.molecule_barcode_tag,
gene_name_tag=args.gene_name_tag,
Expand Down
3 changes: 1 addition & 2 deletions src/sctools/test/test_barcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,7 @@ def tagged_bamfile():
outbam,
]
platform.TenXV2.attach_barcodes(args)
yield outbam
os.remove(outbam)
return outbam


def test_correct_bam_produces_cb_tags(tagged_bamfile, truncated_whitelist_from_10x):
Expand Down
Loading

0 comments on commit a3ec39d

Please sign in to comment.