# myTest.ipynb
## Marcus Viscardi,    January 26, 2023

So I realized that I was just running ahead on this project without Connor. I want to slow down and do these parts step-by-step with him so that he can learn some of the process. BUT I still want to hold onto that code I wrote while I was running ahead, so I'll move it into here.

In [1]:
import mappy
from random import choices as rand_choices
from random import random, randint
from math import log10

from Bio import SeqIO

import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
from standardsAlignment.version2_mappingStandardsMethod import print_mappy_hit_alignment

In [2]:
# Make new reference with all Ts and Cs collapsed into Ts:

# First read the reference line by line
with open("./testRef.TandC_to_T.fasta", 'w') as new_reference:
    for name, seq, qual, comment in mappy.fastx_read("../testRef.fasta", read_comment=True):
        original_ref_seq = seq
        new_seq = ''.join(['T' if nucl.lower() in ('t', 'c') else nucl for nucl in seq])
        new_reference.write(f">{name}_TandC_to_T\n{new_seq}")

In [7]:
first_read_test = "TAATACGACTCACTATAGGGAGAgccatcagattgtgtttgttagtcgctGTGGATGGGTAAGGGTGTTATGaacgctgtcaacaacgtcaacaacgtcattgctgctgctttcgtcaaggccaacctagatgttaaggaccaaaaggccgtcgatgacttcttgttgtctttggatggtaccgccaacaagtccaagttgggtgctaacgctatcttgggtgtctccatggccgctgctagagccgctgctgctgaaaagaacgtcccattgtaccaacatttggctgacttgtctaagtccaagacctctccatacgttttgccagttccattcttgaacgttttgaacggtggttcccacgctggtggtgctttggctttgcaagaattcatgattgctccaactggtgctaagaccttcgctgaagccatgagaattggttccgaagtttaccacaacttgaagtctttgaccaagaagagatacggtgcttctgccggtaacgtcggtgacgaaggtggtgttgctccaaacattcaaaccgctgaagaagCTTTGGACTTGATTGTTGACGCcatcgtcgtgagtagtgaaccgtaagcaagagaagagNNNNNNNNtcagctcagcGGTGTTGTTgtgctgtgctgtgctgtgct"  # just pulled this sequence from the original reference!

aligner = mappy.Aligner(f"./testRef.TandC_to_T.fasta", n_threads=16,  # Path to reference Fasta
                        preset="map-ont", k=14,  # These are the usual go-tos for mapping ONT dRNA-Seq
                        # extra_flags=extra_mappy_flag,  # From: https://github.com/lh3/minimap2/blob/master/minimap.h
                        # #           0x200000 = forces strand reverse strand alignment (b/c we have cDNA)
                        )

# TAD8A_rate = 0.99
reps = 5
steps = 20

for TAD8A_rate in [(i * (100/steps))/100 for i in range(steps+1)]:
    org_mismatch_rate_list = []
    
    for _ in range(reps):
        modified_read = ''.join([rand_choices(('C', 'T'), weights=(TAD8A_rate, 1-TAD8A_rate), k=1)[0] if nucl.lower() == 't' else nucl for nucl in original_ref_seq])
        
        seq_to_map = ''.join(['T' if nucl.lower() in ('t', 'c') else nucl for nucl in modified_read])
        
        hit_objs = {}
        # Loop through each of the mapping hits generated:
        for hit in aligner.map(seq_to_map, cs='long'):
            hit_objs[hit.ctg] = hit
        
        if hit_objs:
            for contig, hit_obj in hit_objs.items():
                # print(hit_obj)
                # print(f"\n\nW/ T&C->T Collapsed Read and Reference:")
                _, adjusted_middle_line, _ = print_mappy_hit_alignment(hit_obj, seq_to_map, aligner.seq(hit_obj.ctg), line_print_width=100, do_not_print=True)
                
                # print(f"\n\nW/out T&C->T Collapsed Read and Reference (but using the same 'alignment' information):")
                _, original_middle_line, _ = print_mappy_hit_alignment(hit_obj, modified_read, original_ref_seq, line_print_width=100, do_not_print=False)
                
                adj_matches = adjusted_middle_line.count("|")
                adj_mismatches = adjusted_middle_line.count("•")
                
                org_matches = original_middle_line.count("|")
                org_mismatches = original_middle_line.count("•")
                
                org_mismatch_rate = org_mismatches / (org_mismatches + org_matches)
                org_mismatch_rate_list.append(org_mismatch_rate)
        else:
            # This means no map!!
            print("failed map.")
    avg_org_mismatch_rate = sum(org_mismatch_rate_list) / len(org_mismatch_rate_list)
    print(f"Mismatch Rate of: {avg_org_mismatch_rate:>6.2%} when we have a TAD8A modification rate (A->I) of {TAD8A_rate:>3.0%}")




Read: TAATACGACTCACTATAGGGAGAGCCATCAGATTGTGTTTGTTAGTCGCTGTGGATGGGTAAGGGTGTTATGAACGCTGTCAACAACGTCAACAACGTCA
      ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Ref:  TAATACGACTCACTATAGGGAGAGCCATCAGATTGTGTTTGTTAGTCGCTGTGGATGGGTAAGGGTGTTATGAACGCTGTCAACAACGTCAACAACGTCA



Read: TTGCTGCTGCTTTCGTCAAGGCCAACCTAGATGTTAAGGACCAAAAGGCCGTCGATGACTTCTTGTTGTCTTTGGATGGTACCGCCAACAAGTCCAAGTT
      ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Ref:  TTGCTGCTGCTTTCGTCAAGGCCAACCTAGATGTTAAGGACCAAAAGGCCGTCGATGACTTCTTGTTGTCTTTGGATGGTACCGCCAACAAGTCCAAGTT



Read: GGGTGCTAACGCTATCTTGGGTGTCTCCATGGCCGCTGCTAGAGCCGCTGCTGCTGAAAAGAACGTCCCATTGTACCAACATTTGGCTGACTTGTCTAAG
      ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Ref:  GGGTGCTAACGCTATCTTGGGTGTCTCCATGGCCGCTGCTAGAGCCGCTGCTGCTGAAAAGAACGTCCCATTGTACCAACATTTGGCTGACTTGTCTAAG



Read: TCCAAGACCTCTCCATACG

In [74]:
# Lets generate some reads:
test_reads_path = f"./testReads.fromMarcus.withTAD8Aedits.fasta"
base_read = "TAATACGACTCACTATAGGGAGAgccatcagattgtgtttgttagtcgctGTGGATGGGTAAGGGTGTTATGaacgctgtcaacaacgtcaacaacgtcattgctgctgctttcgtcaaggccaacctagatgttaaggaccaaaaggccgtcgatgacttcttgttgtctttggatggtaccgccaacaagtccaagttgggtgctaacgctatcttgggtgtctccatggccgctgctagagccgctgctgctgaaaagaacgtcccattgtaccaacatttggctgacttgtctaagtccaagacctctccatacgttttgccagttccattcttgaacgttttgaacggtggttcccacgctggtggtgctttggctttgcaagaattcatgattgctccaactggtgctaagaccttcgctgaagccatgagaattggttccgaagtttaccacaacttgaagtctttgaccaagaagagatacggtgcttctgccggtaacgtcggtgacgaaggtggtgttgctccaaacattcaaaccgctgaagaagCTTTGGACTTGATTGTTGACGCcatcgtcgtgagtagtgaaccgtaagcaagagaagag"

how_many_to_gen = 25
min_length, max_length = 100, len(base_read)

with open(test_reads_path, 'w') as output_fasta:
    for read_num in range(how_many_to_gen):
        TAD8A_rate = random()
        read_length = randint(min_length, max_length)
        read_start = randint(0, len(base_read) - read_length)
        read_end = read_start + read_length
        modified_read = ''.join([rand_choices(('C', 'T'), weights=(TAD8A_rate, 1-TAD8A_rate), k=1)[0] if nucl.lower() == 't' else nucl for nucl in base_read])
        modified_read = modified_read[read_start:]
        
        # print(f"Mod Rate: {TAD8A_rate:>6.2%}\tRead Start: {read_start:>3}\tRead End: {read_end:>3}\tRead Length: {read_length:>3}")
        output_fasta.write(f">test_read_{read_num+1:0>5} expected_mod_rate={TAD8A_rate:0.5} expected_read_length={read_length} expected_read_start={read_start} expected_read_end={read_end}\n{modified_read}\n")

In [79]:
test_reads = SeqIO.parse(open(test_reads_path), 'fasta')
for read in test_reads:
    print(read.format('fasta-2line'))
    print(len(str(read.seq)))

>test_read_00001 expected_mod_rate=0.7189 expected_read_length=593 expected_read_start=21 expected_read_end=614
GAgccaTcagaTCgCgTCCgCTagCcgcCGTGGATGGGCAAGGGCGCCACGaacgcTgTcaacaacgCcaacaacgTcaCCgcCgcCgcCCTcgCcaaggccaaccCagaCgCCaaggaccaaaaggccgTcgaCgacTCcTCgTCgCcCCCggaCggTaccgccaacaagTccaagCTgggCgcCaacgcCaCcCCgggCgCcCccaTggccgcCgcTagagccgcCgcCgcCgaaaagaacgCcccaTTgCaccaacaTTCggcCgacCTgCcTaagCccaagaccCcCccaCacgTCCTgccagCTccaCCcCCgaacgCTTTgaacggCggTTcccacgcTggCggTgcCCTggcCCCgcaagaaCCcaCgaCCgcCccaacCggTgcCaagaccCCcgcCgaagccaCgagaaCCggTCccgaagCCTaccacaacCCgaagCcCCTgaccaagaagagaCacggCgcTCcCgccggCaacgCcggCgacgaaggCggCgTCgcTccaaacaTCcaaaccgcTgaagaagCCCCGGACCCGACCGCCGACGCcaTcgCcgCgagCagTgaaccgCaagcaagagaagag

594
>test_read_00002 expected_mod_rate=0.67696 expected_read_length=189 expected_read_start=353 expected_read_end=542
TggTTcccacgcCggCggCgcCCCggcCTCgcaagaaCCcaCgaTTgcCccaacTggCgcCaagaccCTcgcTgaagccaTgagaaCCggTCccgaagCCCaccacaacCTgaagTcCCCgaccaagaagagaTacggCgcTCcCgccggTaacgCcggCgacgaaggTggCgC