Skip to content

Commit

Permalink
sort fasta and make split-seq fwd-stranded
Browse files Browse the repository at this point in the history
  • Loading branch information
Yenaled committed Nov 21, 2023
1 parent 6662460 commit 0be6ede
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 9 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/
.vscode
.vscode
ngs_tools/.DS_Store
.DS_Store
2 changes: 1 addition & 1 deletion ngs_tools/chemistry/SingleCellChemistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def whitelist_path(self) -> str:
name='SPLiT-seq',
description='Rosenberg et al. 2018',
n=2,
strand=SequencingStrand.UNSTRANDED,
strand=SequencingStrand.FORWARD,
cdna_parser=SubSequenceParser(SubSequenceDefinition(0)),
cell_barcode_parser=SubSequenceParser(
SubSequenceDefinition(1, 10, 8),
Expand Down
21 changes: 14 additions & 7 deletions ngs_tools/fasta/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,23 @@ def split_genomic_fasta_to_cdna(
Returns:
Path to written FASTA
"""
gene_infos_sorted = {k: gene_infos[k] for k in sorted(gene_infos)}
with Fasta(fasta_path, 'r') as f_in, Fasta(out_path, 'w') as f_out:
for entry in progress(f_in, desc='Splitting cDNA',
disable=not show_progress):
# Find all genes and transcripts in this chromosome
_gene_infos = {}
_transcript_infos = {}
for gene_id, gene_attributes in gene_infos.items():
_tx_infos = {}
for gene_id, gene_attributes in gene_infos_sorted.items():
if gene_attributes['chromosome'] == entry.name:
_gene_infos[gene_id] = gene_attributes
_transcript_infos.update({
_tx_infos.update({
transcript_id: transcript_infos[transcript_id]
for transcript_id in gene_attributes['transcripts']
})

_transcript_infos = {k: _tx_infos[k] for k in sorted(_tx_infos)}

# Write all transcripts as separate FASTA entries.
for transcript_id, transcript_attributes in _transcript_infos.items(
):
Expand Down Expand Up @@ -98,20 +101,23 @@ def split_genomic_fasta_to_intron(
Returns:
Path to written FASTA
"""
gene_infos_sorted = {k: gene_infos[k] for k in sorted(gene_infos)}
with Fasta(fasta_path, 'r') as f_in, Fasta(out_path, 'w') as f_out:
for entry in progress(f_in, desc='Splitting introns',
disable=not show_progress):
# Find all genes and transcripts in this chromosome
_gene_infos = {}
_transcript_infos = {}
for gene_id, gene_attributes in gene_infos.items():
_tx_infos = {}
for gene_id, gene_attributes in gene_infos_sorted.items():
if gene_attributes['chromosome'] == entry.name:
_gene_infos[gene_id] = gene_attributes
_transcript_infos.update({
_tx_infos.update({
transcript_id: transcript_infos[transcript_id]
for transcript_id in gene_attributes['transcripts']
})

_transcript_infos = {k: _tx_infos[k] for k in sorted(_tx_infos)}

# Write all transcripts as separate FASTA entries.
for transcript_id, transcript_attributes in _transcript_infos.items(
):
Expand Down Expand Up @@ -169,12 +175,13 @@ def split_genomic_fasta_to_nascent(
Returns:
Path to written FASTA
"""
gene_infos_sorted = {k: gene_infos[k] for k in sorted(gene_infos)}
with Fasta(fasta_path, 'r') as f_in, Fasta(out_path, 'w') as f_out:
for entry in progress(f_in, desc='Splitting nascent',
disable=not show_progress):
# Find all genes in this chromosome
_gene_infos = {}
for gene_id, gene_attributes in gene_infos.items():
for gene_id, gene_attributes in gene_infos_sorted.items():
if gene_attributes['chromosome'] == entry.name:
_gene_infos[gene_id] = gene_attributes
gene_name = gene_attributes.get('gene_name')
Expand Down

0 comments on commit 0be6ede

Please sign in to comment.