# revisiting_threeNucl_mappy.ipynb
## Marcus Viscardi,    June 29, 2023

So it sounds like we are hitting a wall getting minimap2 to align the cDNA reads from TAD-treated libraries (specifically w/ 3nt genomes?).

Some of the libraries (best ones):
` /data16/marcus/working/230613_nanoporeRun_sMV025-RNAStds_50-50_LT_TAD_Nano3P `
` /data16/marcus/working/230612_nanoporeRun_sMV025-RNAStds_50-50_LT_MockTAD_Nano3P `

In [54]:
import mappy
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from pprint import pprint

from Bio import SeqIO, Seq, SeqRecord
import pysam

import nanoporePipelineCommon as npC

# Confirm that the import cell finished; the timestamp string comes from the
# project helper npC.get_dt (called with for_print=True).
print(f"Imports done at {npC.get_dt(for_print=True)}")

Imports done at 06/29/23 @ 04:01:22 PM


# Paths:

In [10]:
# --- Sequencing libraries -------------------------------------------------
# Each library lives in its own run directory; the pipeline output is under
# 'output_dir', and the concatenated reads/alignments are under 'cat_files'.
treated_parent = Path('/data16/marcus/working/230613_nanoporeRun_sMV025-RNAStds_50-50_LT_TAD_Nano3P/output_dir')
mock_parent = Path('/data16/marcus/working/230612_nanoporeRun_sMV025-RNAStds_50-50_LT_MockTAD_Nano3P/output_dir')

# Concatenated FASTQ for each library
treated_fastq = treated_parent.joinpath('cat_files', 'cat.fastq')
mock_fastq = mock_parent.joinpath('cat_files', 'cat.fastq')

# The BAMs sit next to the FASTQs (same 'cat_files' directory)
treated_bam = treated_fastq.with_name('cat.sorted.mappedAndPrimary.bam')
mock_bam = mock_fastq.with_name('cat.sorted.mappedAndPrimary.bam')

# --- Reference genome -----------------------------------------------------
genome_parent = Path('/data16/marcus/genomes/plus_cerENO2_elegansRelease100')
genome_cDNA_fasta = genome_parent.joinpath('230327_allcDNA_plus-cerENO2.cdna.all.fa')
genome_DNA_fasta = genome_parent.joinpath('230327_allChrs_plus-cerENO2.allChrs.fa')
genome_gtf = genome_parent.joinpath('230327_allChrs_plus-cerENO2.gtf')
genome_parsed_gtf = genome_parent.joinpath('230327_allChrs_plus-cerENO2.gtf.parquet')
genome_bed = genome_parent.joinpath('230327_allChrs_plus-cerENO2.bed')

# --- Output directories for this notebook ---------------------------------
# Both output folders are siblings inside 'mappyWithExtendedNuclCode'.
fastx_files = Path('/data16/marcus/scripts/nanoporePipelineScripts/mappyWithExtendedNuclCode/fastxFiles')
genome_files = fastx_files.with_name('genomeFiles')

# Make 3NT reads:

Going to just run with ChrIII because it is the shortest chromosome (@ 13,783,801bp)

I'll also use the BAM file to just keep reads that had an alignment to the genome on ChrIII, so we don't have to trudge through all the reads for testing.
***
**@Liam:** You can probably skip this cell and the next. They didn't take very long to run, but the outputs are already made. Feel free to edit and run them if you want to mess with it, though.

In [29]:

target_bam = treated_bam
# The library name is the run directory three levels above the BAM
# (<run_dir>/output_dir/cat_files/<bam>).
lib_name = target_bam.parent.parent.parent.stem
target_chr = 'III'

# Read the BAM and hold onto any reads that previously hit target_chr (ChrIII).
# The context manager closes the BAM handle as soon as the fetch is finished.
# NOTE(review): the sequence is taken exactly as stored in the BAM record; for
# reverse-strand alignments that is presumably the reverse complement of the
# original read - confirm this is what we want before the G->T / C->A edits.
with pysam.AlignmentFile(str(target_bam), 'rb') as bam:
    output_rec_list = [
        SeqRecord.SeqRecord(Seq.Seq(read.query_sequence), id=read.query_name, description='')
        for read in bam.fetch(target_chr)
    ]
# Save the unmodified (4-nucleotide) reads to a fasta file
SeqIO.write(output_rec_list, fastx_files / f'{lib_name}_Chr{target_chr}-Hits.4nucl.fasta', 'fasta')

# Now make the modified (3-nucleotide) versions of every read.
# NOTE(review): str.replace is case-sensitive, so only upper-case bases are
# converted - confirm the BAM sequences are upper-case.
mod_GtoT_rec_list = [
    SeqRecord.SeqRecord(Seq.Seq(str(rec.seq).replace('G', 'T')), id=rec.id, description='')
    for rec in output_rec_list
]
# This one doesn't really make sense... but might as well be thorough
mod_CtoA_rec_list = [
    SeqRecord.SeqRecord(Seq.Seq(str(rec.seq).replace('C', 'A')), id=rec.id, description='')
    for rec in output_rec_list
]
# Save to fasta files (SeqIO.write returns the number of records written,
# which the notebook shows as the cell output)
SeqIO.write(mod_GtoT_rec_list, fastx_files / f'{lib_name}_Chr{target_chr}-Hits.3nucl.GtoT.fasta', 'fasta')
SeqIO.write(mod_CtoA_rec_list, fastx_files / f'{lib_name}_Chr{target_chr}-Hits.3nucl.CtoA.fasta', 'fasta')

390

# Create the modified "genomes"
(These "genomes" are just ChrIII)

In [72]:
# Edit the genome to only include ChrIII and make modified versions.
# Every variant is written to its own FASTA file in genome_files.


def _three_nucl_record(record, old, new, tag, rev_comp=False):
    """Return a new SeqRecord with every `old` base of *record* replaced by `new`.

    The replacement is applied to the forward sequence first; when `rev_comp`
    is true the converted sequence is then reverse complemented. The new id is
    '<record.id>.<tag>' (plus '.rev_comp' for reverse-complemented records) and
    the description is the parent's description minus its first
    space-separated token.
    """
    # NOTE(review): str.replace is case-sensitive, so lower-case (soft-masked)
    # bases would NOT be converted - confirm the genome FASTA is upper-case.
    converted = Seq.Seq(str(record.seq).replace(old, new))
    new_id = f'{record.id}.{tag}'
    if rev_comp:
        converted = converted.reverse_complement()
        # Give the reverse strand its own id, matching the 4nucl rev_comp record
        new_id += '.rev_comp'
    return SeqRecord.SeqRecord(converted,
                               id=new_id,
                               description=' '.join(record.description.split(' ')[1:]))


# First let's just save a copy of ChrIII and a rev_comp version in our working dir
genome = SeqIO.to_dict(SeqIO.parse(genome_DNA_fasta, 'fasta'))
chrIII = genome['III']
SeqIO.write([chrIII], genome_files / 'III.4nucl.fa', 'fasta')
chrIII_rev_comp = chrIII.reverse_complement()
chrIII_rev_comp.id = chrIII.id + '.rev_comp'
# SeqRecord.reverse_complement() does not keep the parent's description by
# default (it is reset to a placeholder), so build the new description from
# chrIII itself rather than from the reverse-complemented record.
chrIII_rev_comp.description = ' '.join(chrIII.description.split(' ')[1:])
SeqIO.write([chrIII_rev_comp], genome_files / 'III.4nucl.rev_comp.fa', 'fasta')

# Now let's make the modified versions:
# G to T
chrIII_3nucl_GtoT = _three_nucl_record(chrIII, 'G', 'T', 'GtoT')
SeqIO.write([chrIII_3nucl_GtoT], genome_files / 'III.3nucl.GtoT.fa', 'fasta')

# G to T rev_comp
chrIII_3nucl_GtoT_rev_comp = _three_nucl_record(chrIII, 'G', 'T', 'GtoT', rev_comp=True)
SeqIO.write([chrIII_3nucl_GtoT_rev_comp], genome_files / 'III.3nucl.GtoT.rev_comp.fa', 'fasta')

# C to A
chrIII_3nucl_CtoA = _three_nucl_record(chrIII, 'C', 'A', 'CtoA')
SeqIO.write([chrIII_3nucl_CtoA], genome_files / 'III.3nucl.CtoA.fa', 'fasta')

# C to A rev_comp
chrIII_3nucl_CtoA_rev_comp = _three_nucl_record(chrIII, 'C', 'A', 'CtoA', rev_comp=True)
SeqIO.write([chrIII_3nucl_CtoA_rev_comp], genome_files / 'III.3nucl.CtoA.rev_comp.fa', 'fasta')

1

# Build the mappy aligners
We can make one for each modified and reverse-complemented genome

[Link to SUPER SHORT Mappy documentation](https://pypi.org/project/mappy/)

In [86]:
# Build one mappy aligner per genome FASTA in genome_files, keyed by the file
# name without its final '.fa' suffix (e.g. 'III.3nucl.GtoT.rev_comp').
mappy_aligner_dict = {}
# iterdir() yields entries in arbitrary order; sort so the dict (and therefore
# the printed hit order later on) is reproducible between runs.
for genome_path in sorted(genome_files.iterdir()):
    if genome_path.suffix != '.fa':
        continue
    aligner = mappy.Aligner(str(genome_path),
                            preset='map-ont',  # avoiding using the splice options for now
                            best_n=1,  # only keeps one hit (no secondaries)
                            n_threads=16,
                            extra_flags=0x100000,
                            #           0x100000 = FWD alignments only
                            #           0x200000 = REV alignments only
                            )
    # mappy does not raise when an index cannot be loaded/built; the Aligner
    # is just falsy, so fail loudly here instead of silently getting no hits.
    if not aligner:
        raise RuntimeError(f'Failed to load/build minimap2 index for {genome_path}')
    mappy_aligner_dict[genome_path.stem] = aligner

# Start aligning!?
***
**@Liam**: This is probably where you can dig in a bit with some diagnostics

In [88]:
# Pick which read set to align; swap the file name for one of the
# alternatives below to try the 3-nucleotide reads instead.
target_fasta = fastx_files / f'{lib_name}_Chr{target_chr}-Hits.4nucl.fasta'
#                            f'{lib_name}_Chr{target_chr}-Hits.3nucl.GtoT.fasta'
#                            f'{lib_name}_Chr{target_chr}-Hits.3nucl.CtoA.fasta'

# Map every read against every aligner, printing the read id followed by one
# line per hit (aligner name padded to 30 characters, then the mappy hit).
hit_count = 0
for record in SeqIO.parse(target_fasta, 'fasta'):
    print(record.id)
    query_seq = str(record.seq)  # same query string for every aligner
    for label, mapper in mappy_aligner_dict.items():
        for alignment in mapper.map(query_seq, cs=True):
            # TODO: A bit more here would allow for a lot of diagnostics! Good for Liam.
            print(f"{label:<30}{alignment}")
            hit_count += 1
    print()  # blank line between reads

print(f'Hit count: {hit_count}')

5756fbbe-b929-437c-a0d9-e91f3f2c79e5
III.4nucl                     34	271	+	III	13783801	41734	41975	211	248	57	tp:A:P	ts:A:.	cg:Z:91M2I11M1I33M3D60M4I5M6D4M2D26M	cs:Z::4*ag:24*ag:8*ag:9*ag:10*ag:11*ag:2*ag:11*ag:1*at*cg:1+tt:11+g:1*ag:1*ag*ag:28-gcc:4*ag:1*tc:51*ag:1+gcgc:5-atacaa:4-ag:5*ag:3*ag:10*ag:5

c9ab481d-b1ef-49f8-b518-ab457b05b889

24fd8eef-c233-4031-83fc-c4d19abb845e
III.4nucl                     98	304	+	III	13783801	599511	599718	190	207	60	tp:A:P	ts:A:.	cg:Z:126M1D80M	cs:Z::12*tc*tc*tc:5*tc:7*tc*tc:9*tc*tc:5*tc:1*tc:4*tc:10*tc:20*tc:31*tc:8-c:17*tc:18*tc:43

68b7241d-bda1-4a4f-9b1c-aa3b9bc06506

0c4ec0fc-5bde-42f3-9297-d403e454c7ff
III.4nucl                     1	429	+	III	13783801	785640	786084	366	451	40	tp:A:P	ts:A:.	cg:Z:54M1D5M5D6M1D19M2D12M1I8M3D26M1D19M1D11M1D93M2D19M3I2M1D11M1I31M2D24M1D10M2D21M1I12M1I38M	cs:Z::10*ag:5*ag*ag*ag*ag:1*ag:23*ct:1*tc:3*ag:2-c:5-ttccc:6-a:1*ag*tc:14*ta:1-ga:1*ag:6*ag:3+t:1*ag:6-tta:2*ag:4*ag*ac:3*tc:2*ct:10-t:11*ag:2*ag*tc:3-a*ag:1*ag