# 221117_comparing_Nano3P-v-dRNA_Stds.ipynb
## Marcus Viscardi,    November 17, 2022

In [2]:
import mappy as mp
import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
from nanoporePipelineCommon import *

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
import pandas as pd
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

print("imports done.")

imports done.


In [65]:
# Doing this as a class makes a lot more sense... I should do this in the future!
class StdLibrary:
    def __init__(self,
                 assignment_csv: str,
                 is_dRNA_not_Nano3P: bool,
                 ):
        self.raw_df = pd.read_csv(assignment_csv)
        self.df = self.raw_df.copy(deep=True)
        self.df['read_id'] = self.df["Unnamed: 0"]
        
        self.is_dRNA = is_dRNA_not_Nano3P
        self.is_Nano3P = not is_dRNA_not_Nano3P
        
        if self.is_dRNA:
            self.df.sequence = self.df.sequence.str.replace("U", "T")
        
        def mappy_cols_to_cols(mappy_obj_column):
            """
            The main reason to have this as its own script is to handle the errors that come from failed splits (which result from failed maps!)
            
            :param mappy_obj_column: 
            :return: 
            """
            if isinstance(mappy_obj_column, str):
                return_values = mappy_obj_column.split("\t")
            else:
                return_values = [None for _ in range(13)]
            if len(return_values) == 13:
                return return_values
            else:
                print(return_values)
        
        tqdm.pandas(desc="Extracting information from Mappy column")
        self.df[['q_st', 'q_en', 'strand', 'ctg', 'ctg_len', 'r_st', 'r_en', 'mlen', 'blen', 'mapq', 'tp', 'ts', 'cigar']] = self.df.progress_apply(lambda row: mappy_cols_to_cols(row['mappy_hit_obj']), axis=1, result_type='expand')
        self.hit_df = self.df[self.df.assignment.isin(['00', '05', '10', '15', '30', '60'])]
        self.hit_df['cigar'] = self.hit_df.cigar.str.rsplit(":", 1).str[1]
        self.hit_df = self.hit_df.astype({"r_st": int, "r_en": int, "q_st": int, "q_en": int})
        
        barcode_dict = { '00': 'GGTGTTGTT', '05': 'CGGCAATAA', '10': 'TAATCGTCC', '15': 'CCTTCTAGG', '30': 'ACACACACC', '60': 'AAGAGGAGG'}
        if self.is_Nano3P:
            # We'll flip (rev. comp.) the barcode dict if we are working with nano3P data (cDNA sequences)
            barcode_dict = {key: mp.revcomp(value) for key, value in barcode_dict.items()}
        def check_for_perfect_matches(rc_barcode_dict, **row):
            assignment = row['assignment']
            sequence = row['sequence']
            barcode = rc_barcode_dict[assignment]
            perfect_barcode = barcode in sequence
            return perfect_barcode
        self.hit_df['perfect_barcode'] = self.hit_df.apply(lambda row: check_for_perfect_matches(barcode_dict, **row), axis=1)
        path_to_std_ref_fasta = "220902_version2.0_releventSequences_wOutTails.fasta"
        self.stds_ref_dict = {}
        for ref_id, sequence, _, comments in mp.fastx_read(path_to_std_ref_fasta, read_comment=True):
            cutdown_ref_id = ref_id[-6:-4]
            self.stds_ref_dict[cutdown_ref_id] = sequence
    
    def plot_perfect_barcodes(self, renderer=None, save_to=None):
        hit_df_groupby = self.hit_df.groupby("assignment")

        grouped_df = hit_df_groupby['assignment'].count().to_frame(name="total_count")
        grouped_df['perfect_match_count'] = hit_df_groupby['perfect_barcode'].sum().to_frame(name="perfect_match_count")
        grouped_df['mean_r_en'] = hit_df_groupby['r_en'].mean().to_frame(name="mean_r_en")
        
        grouped_df['imperfect_match_count'] = grouped_df.total_count - grouped_df.perfect_match_count
        grouped_df = grouped_df.reset_index()
        fig = px.bar(grouped_df,
                     x='assignment',
                     y=['perfect_match_count', 'imperfect_match_count'],
                     template="plotly_white")
        fig.update_layout(height=500, width=700)
        fig.update_layout(legend=dict(
            orientation='h',
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01
        ))
        if isinstance(renderer, str):
            fig.show(renderer=renderer)
        else:
            fig.show()
        if isinstance(save_to, str):
            if save_to.endswith("html"):
                fig.write_html(save_to)
            else:
                fig.write_image(save_to)
    
    def print_alignments(self,
                         number_to_print: int,
                         select_assignment=None,
                         constrict_to_width=None,
                         select_with_head=False,
                         additional_filtering_query=None):
        def print_mappy_hit_alignment_for_stds(read_id, cigar, r_st, r_en, q_st, q_en, strand, sequence, assignment,
                                               ref_dict=None, line_print_width=None,
                                               **other_row_items) -> None:
            import re
            ref_seq = ref_dict[assignment].upper()
            print(f"\nread_id={read_id}; assignment={assignment}")
            parsed_cigar = re.findall(rf'(\d+)([MDNSIX])', cigar)
            parsed_cigar = [(int(num), char) for num, char in parsed_cigar]
            ref_seq = ref_seq[r_st: r_en]
            ref_pos = 0
            sequence = sequence[q_st: q_en]
            read_pos = 0
            if strand == "-":
                sequence = mp.revcomp(sequence)
            top_line = ""
            middle_line = ""
            bottom_line = ""
            for length, code in parsed_cigar:
                if code == "M":  # Map (Read & Ref Match)
                    read_map_piece = sequence[read_pos:read_pos + length]
                    ref_map_piece = ref_seq[ref_pos:ref_pos + length]
                    perfect_matches = ""
                    for index, char in enumerate(read_map_piece):
                        try:
                            if char == ref_map_piece[index]:
                                perfect_matches += "|"
                            else:
                                perfect_matches += "•"
                        except IndexError:
                            perfect_matches += " "
                    top_line += read_map_piece
                    middle_line += perfect_matches
                    bottom_line += ref_map_piece
                    ref_pos += length
                    read_pos += length
                elif code == "I":  # Insert (Gap in Ref)
                    top_line += sequence[read_pos:read_pos + length]
                    middle_line += " " * length
                    bottom_line += " " * length
                    read_pos += length
                elif code == "D" or code == "N":  # Delete (Gap in Read)
                    top_line += " " * length
                    middle_line += " " * length
                    bottom_line += ref_seq[ref_pos:ref_pos + length]
                    ref_pos += length
            if isinstance(line_print_width, int):
                num_blocks = int(np.ceil(len(top_line) / line_print_width))
                print_blocks = []
                for block_index in range(num_blocks):
                    print_blocks.append([
                        top_line[block_index * line_print_width:(block_index + 1) * line_print_width],
                        middle_line[block_index * line_print_width:(block_index + 1) * line_print_width],
                        bottom_line[block_index * line_print_width:(block_index + 1) * line_print_width],
                    ])
                for top, mid, bot in print_blocks:
                    print()
                    print(f"Read: {top}")
                    print(f"      {mid}")
                    print(f"Ref:  {bot}")
            else:
                print(f"Read: {top_line}", f"      {middle_line}", f"Ref:  {bottom_line}", sep='\n')
        
        if isinstance(select_assignment, str):
            print_df = self.hit_df[self.hit_df.assignment == select_assignment]
        else:
            print_df = self.hit_df
        
        if isinstance(additional_filtering_query, str):
            print_df = print_df.query(additional_filtering_query)
        
        if select_with_head:
            print_df = print_df.head(number_to_print)
        else:
            print_df = print_df.sample(number_to_print)
        
        if isinstance(constrict_to_width, int):
            print_df.apply(lambda row: print_mappy_hit_alignment_for_stds(ref_dict=self.stds_ref_dict, line_print_width=constrict_to_width, **row), axis=1)
        else:
            print_df.apply(lambda row: print_mappy_hit_alignment_for_stds(ref_dict=self.stds_ref_dict, line_print_width=175, **row), axis=1)

In [66]:
nano3P = StdLibrary("221031_RNAStds_Nano3P_Assignments.csv", False)
dRNA = StdLibrary("221114_RNAStds_dRNA_Assignments.csv", True)


Extracting information from Mappy column: 100%|██████████| 247906/247906 [00:08<00:00, 29029.97it/s] 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Extracting information from Mappy column: 100%|██████████| 51618/51618 [00:01<00:00, 32147.99it/s] 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [68]:
nano3P.plot_perfect_barcodes(save_to=f"{get_dt(for_file=True)}_nano3P_barcodeCounts.png")
dRNA.plot_perfect_barcodes(save_to=f"{get_dt(for_file=True)}_dRNA_barcodeCounts.png")

In [52]:
dRNA.print_alignments(50,
                      constrict_to_width=200,
                      select_assignment=None,
                      select_with_head=False,
                      additional_filtering_query="perfect_barcode == True")


read_id=b2e0be4c-4a50-498a-a102-7eadd4fec716; assignment=05

Read: AGATTGTGTTTGTTAGTTCGCTGTGGATGGGTAAGGGTGTTATGAACGTTGTCAACAACGTCAACAACGTCATTGTTGCTGCTTTCGTCAAGGCCAACCTAGATGTTAAGGACCTTAAAAGGCCGTCGATGA   CTTGTTGTCTTT GACGGTACCGCCAACTAGTCCAAGTTGGGTGCTAACGCTACATATTTGGGTG
      |||||||||||||||| |||||||||||||||||||||||||||||||•||||||||||||||||||||||||||•||||||||||||||||||||||||||||||||||||||  ||||||||||||||||   |||||||||||| ||•||||||||||||•|||||||||||||||||||||||  || •|||||||
Ref:  AGATTGTGTTTGTTAG TCGCTGTGGATGGGTAAGGGTGTTATGAACGCTGTCAACAACGTCAACAACGTCATTGCTGCTGCTTTCGTCAAGGCCAACCTAGATGTTAAGGACC  AAAAGGCCGTCGATGACTTCTTGTTGTCTTTGGATGGTACCGCCAACAAGTCCAAGTTGGGTGCTAACGCT  AT CTTGGGTG

Read: TCTCCATATGGCCGCTGCTAGAGCCGCTGCTGCTGCTGAAAAGAACGTTCCCATTGTACCAACATATTGGCTGACTTGTCTCACCAAGTCCAAGACCTCTCCATACGTTTTGCCAGTTCCA TCTTGAACG TTTGAACGGTGGTTCCCACGCTGGTGGTGCTTTGGCTTTGCAAGAA TCATGATTGCTCCAACTGGTG
      |||||  ||||||||||||||||||   ||||||||||||||||||| ||||||||||||||||| |||||||||||||||    ||||||||||||||||||

In [73]:
plot_df = dRNA.hit_df.sort_values("assignment")  #[dRNA.hit_df.perfect_barcode]
fig = px.box(plot_df,
             x='assignment',
             y='r_en',
             range_y=[642,665],
             # points="outliers",
             )
fig.update_layout(height=500, width=700)
fig.add_hline(662)
fig.show(renderer="firefox")

In [69]:
dRNA.hit_df.shape

(44209, 19)

In [70]:
nano3P.hit_df.shape

(208876, 19)

In [72]:
nano3P_groupby = nano3P.df.groupby("assignment")
dRNA_groupby = dRNA.df.groupby("assignment")
# Make a groupby, get counts for perfect barcodes and total counts
# Merge those by assignment
# Plot those as X and Y axes