# plottingNewAndOldReads.ipynb
## Marcus Viscardi,    January 16, 2023

This script is going to take the methods from finalizingReadAndCoveragePlotting_matplotlib.py and use those to plot the new reads!

In [None]:
import os
import sys
# Put the shared nanopore pipeline scripts directory at the front of the module
# search path; presumably this is what makes the two project imports below resolvable.
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon
from nanoporeReadPlotting.finalizingReadAndCoveragePlotting_matplotlib import plot_reads, coverage_plotting_5tera

import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt

# Progress message marking the end of the import cell.
# NOTE(review): get_dt(for_print=True) presumably returns a human-readable date/time string - confirm in npCommon.
print(f"Imports done at {npCommon.get_dt(for_print=True)}")

In [None]:
# Load the newest saved read-level and compressed dataframes, then rename the
# long library identifiers to the short labels used throughout the plotting code.
# A ValueError raised anywhere in this step is treated as "no saved dataframes
# available": both names are set to None and a hint is printed instead.
try:
    reads_df_genes, compressed_df_genes = (
        pd.read_parquet(npCommon.find_newest_matching_file(file_pattern))
        for file_pattern in (
            "./output_files/*_quad5TERA.reads_df.parquet",
            "./output_files/*_quad5TERA.compressed_df.parquet",
        )
    )
    # Long library name -> short plotting label ("old*"/"new*" prefix + genotype code).
    conversion_dict = {
        "xrn-1-5tera": "oldN2",
        "xrn-1-5tera-smg-6": "oldS6",
        "5tera_xrn-1-KD_wt": "newN2",
        "5tera_xrn-1-KD_smg-5": "newS5",
        "5tera_xrn-1-KD_smg-6": "newS6",
        "5tera_xrn-1-KD_smg-7": "newS7",
        "sPM57": "sPM57",
        "sPM58": "sPM58",
    }
    # In-place value replacement, applied to every column of each dataframe.
    reads_df_genes.replace(conversion_dict, inplace=True)
    compressed_df_genes.replace(conversion_dict, inplace=True)
    print(f"Finished library loading at {npCommon.get_dt(for_print=True)}")
except ValueError:
    reads_df_genes = compressed_df_genes = None
    print("Could not find pre-compressed dataframes saved. Try running the first few cells of initialTestingAndScratchPaper.ipynb")

# Plotting of reads and coverage for specific genes
Some genes of interest:
- ubl-1
  - ubl-1 has been a classic gene for me. It has one NMD sensitive isoform that is obviously stabilized with the inclusion of NMD-breaking alleles
- rpl-30
  - rpl-30 is pretty interesting as it seems to have one NMD-sensitive isoform that is related to a longer 5' UTR region. Maybe this is due to a uORF? This gene might also be one of the annotated NMD targets in *C. elegans*.
  - This isoform is identified as Y106G6H.3c. It is annotated as protein coding, but the cleavages all seem to be in the 5' UTR! Very likely a uORF; I need to check the literature on the NMD sensitivity of this allele!
- rpl-7A
  - An alt. 5'SS in the third intron produces an NMD-sensitive isoform that is stabilized by NMD inhibition. This is a "known" NMD target in worms.
- F19B2.5
  - Seems like there is only one isoform that is NMD sensitive for F19B2.5, interestingly this gene shows a pretty dramatic increase in RPM with NMD-breaking alleles.
- rpl-3
  - This gene has an alt. 3'SS in the first intron that seems to produce an NMD sensitive isoform. It is stabilized with NMD inhibition. This isoform is F13B10.2b, which is annotated as having no CDS and being non-productive.
- Y73B3A.18
- rpl-26
  - Has a really obvious NMD targeted isoform that we can zoom in on

In [None]:
# Pre-parsed (parquet) GTF annotation; handed to coverage_plotting_5tera via specify_gtf_path in the plotting loop.
parissas_parsed_gtf_path = "/data16/parissa/genomes/221213_210414_srf1004/210414_srf1004Genome_221213mod.gtf.parquet"

# Genes to plot. Each entry is a tuple of
#   (gene_name, divide_factor_base, adapted_adjustment_factor_base)
# Commented-out entries are earlier gene sets kept so they can be toggled back on.
# The plotting loop currently draws a fixed number of reads per library, so the
# two factors of the active entries are placeholders set to 0.
target_genes_plus = [  # Gene name plus its divide factor and adapted adjustment factor
                     # ('odc-1', 50, 5),
                     # ('F19B2.5', 50, 5),
                     # ('rpl-3', 250, 5),
                     # ('rpl-7A', 200, 5),
                     # ('rpl-30', 250, 5),
                     # ('ubl-1', 400, 15),
                     # ('Y73B3A.18', 100, 5),
                     # From Q.M. Mitrovich & P. Anderson (Genes & Dev., 2000):
                     # ID'ed 4 ribosomal proteins that showed change in "expression" upon NMD knockout:
                     # ('rpl-7A', 100, 5),
                     # ('rpl-1', 100, 5),  # aka rpl-10a!!
                     # ('rpl-12', 100, 5),
                     # ('rpl-3', 250, 5),
                     # From V.S. Muir, A.P. Gasch, & P. Anderson (G3, 2018):
                     # Class I (showing up in all analyses)
                     # ('B0495.8', 100, 5),
                     # Some negative controls, random genes:
                     # ('unc-54', 150, 5),
                     # More genes I've found:
                     # ('rpl-26', 150, 25),
                     # ('rps-22', 150, 25),
                     # Parissa's Stuff:
                     # ('unc-54', 150, 25),
                     # ('eef-2', 150, 25),
                     # ('vig-1', 150, 25),
                     # ('car-1', 150, 25),
                     # Some more genes from comparing the Fisher's p-value rankings:
                     ('F19B2.5', 0, 0),
                     ('Y73B3A.18', 0, 0),
                     ('tos-1', 0, 0),
                     ('C30E1.9', 0, 0),
                     ('odc-1', 0, 0),
                     ('rpl-5', 0, 0),
                     ('smd-1', 0, 0),
                     ('Y37E3.8', 0, 0),
                     ('lep-5', 0, 0),
                     ]
# Legacy placeholders: the per-gene values now live in the tuples of target_genes_plus above.
divide_factor_bases = []  # Larger number means fewer reads plotted! Generally 10-500 works well
adapted_adjustment_factor_bases = []  # Larger numbers mean proportionally more adapted reads plotted; they are normally very rare

def _lookup_gene_rpm(compressed_df, lib, gene_name, t5):
    """Return the 'gene_rpm' value for one library / gene / adapter-status combination.

    :param compressed_df: dataframe with 'lib', 'gene_name', 't5' and 'gene_rpm' columns
    :param lib: short library label (e.g. 'newN2')
    :param gene_name: gene to look up
    :param t5: '+' for adapted reads, '-' for unadapted reads
    :return: the first matching 'gene_rpm' value, or 0 when no row matches.
        Because of the way these frames were made, a combination with no reads
        never gets a row, so a missing row means 0 reads.
    """
    matches = compressed_df.query("lib == @lib and gene_name == @gene_name and t5 == @t5")
    if matches.empty:
        return 0
    return matches['gene_rpm'].values[0]


# Number of adapted (t5+) reads to draw per library, keyed on the 4th character
# of the short library label (e.g. the 'N' of 'newN2', the 'S' of 'oldS6').
_ADAPTED_READS_BY_LIB_CODE = {'S': 1, 'N': 4}


def _read_counts_for_lib(lib, total_reads):
    """Split *total_reads* into (adapted, unadapted) read counts to plot for *lib*.

    :param lib: short library label; its 4th character selects the adapted count
    :param total_reads: total number of reads to draw in the read plot
    :return: tuple of (num_adapted, num_unadapted)
    :raises ValueError: if the label has no recognised code at position 4
        (e.g. 'sPM57'), instead of failing later with a NameError or silently
        reusing the count left over from the previous library.
    """
    lib_code = lib[3:4]
    try:
        num_adapted = _ADAPTED_READS_BY_LIB_CODE[lib_code]
    except KeyError:
        raise ValueError(
            f"Don't know how many adapted reads to plot for library {lib!r}: "
            f"expected one of {sorted(_ADAPTED_READS_BY_LIB_CODE)} as its 4th character."
        ) from None
    return num_adapted, total_reads - num_adapted


# Fail early with a readable message if the loading cell could not find the dataframes.
if reads_df_genes is None or compressed_df_genes is None:
    raise RuntimeError("reads_df_genes / compressed_df_genes are not loaded; "
                       "re-run the dataframe loading cell before plotting.")

libs_to_plot = [
    'newN2',
    'newS5',
    'newS6',
    # 'newS7',
    'oldN2',
    'oldS6',
    # 'sPM57',
    # 'sPM58',
]
total_reads_plotted = 8  # Reads drawn per library in the read plot (adapted + unadapted)
figure_file_types = ['.png', '.svg']

# divide_factor_base / adapted_adjustment_factor_base are unpacked for compatibility with
# target_genes_plus but are not used while a fixed number of reads is plotted per library.
for target_gene, divide_factor_base, adapted_adjustment_factor_base in target_genes_plus:

    # One dated output folder per gene.
    target_gene_folder_path = f"/home/marcus/Insync/marcus.viscardi@gmail.com/Google Drive/insync_folder/5TERA_ReadsAndTails_Plots/raw_figures_from_python/{npCommon.get_dt()}_plots_from_{target_gene}"

    try:
        os.mkdir(target_gene_folder_path)
    except FileExistsError:
        # Re-running for the same gene just reuses the existing folder.
        pass

    # One record per library; turned into the stacked RPM bar plot after the library loop.
    bar_plot_dicts = []

    for lib in libs_to_plot:
        # RPM of adapted (t5 '+') and unadapted (t5 '-') reads on this gene in this library.
        adapted_rpm = _lookup_gene_rpm(compressed_df_genes, lib, target_gene, '+')
        unadapted_rpm = _lookup_gene_rpm(compressed_df_genes, lib, target_gene, '-')

        sea.set_style("whitegrid")

        num_adapt_to_plot, num_unadpt_to_plot = _read_counts_for_lib(lib, total_reads_plotted)

        # Three stacked panels: the first two are handed to the coverage plot, the last to the read plot.
        fig, axes = plt.subplots(nrows=3, ncols=1,
                                 height_ratios=(1, 2, 4),
                                 constrained_layout=True,
                                 figsize=(10, 3),
                                 )
        fig.suptitle(f"Coverage and Reads for {target_gene} from {lib}")

        # The reads are filtered separately for each helper so neither can be affected
        # by anything the other might do to the dataframe it receives.
        coverage_plotting_5tera(reads_df_genes.query(f"lib == '{lib}'"),
                                gene_name=target_gene,
                                rpm_normalize=True,
                                provide_axes=axes[:2],
                                specify_gtf_path=parissas_parsed_gtf_path,
                                )

        plot_reads(reads_df_genes.query(f"lib == '{lib}'"), gene_name_to_plot=target_gene,
                   pad_x_axis_bounds_by=50,
                   t5_pos_count=num_adapt_to_plot, t5_neg_count=num_unadpt_to_plot,
                   only_keep_reads_matched_to_gene=True, plot_width_and_height=(4, 3),
                   provided_axis=axes[-1],
                   )
        fig.supylabel('Reads Per Million')
        # NOTE(review): the figure is created with constrained_layout=True and tight_layout()
        # is also called here; both are kept so the saved figures do not change, but
        # confirm which layout is actually intended.
        fig.tight_layout()
        read_plot_save_path = f"{target_gene_folder_path}/readPlots_{target_gene}_{lib}"
        # Save through the figure object so the right figure is written even if a
        # plotting helper changed matplotlib's "current figure".
        for file_type in figure_file_types:
            fig.savefig(read_plot_save_path + file_type,
                        dpi=300)
        # NOTE(review): figures are left open (as before), presumably so a notebook can
        # still display them; add plt.close(fig) here if this is run as a plain script.

        # Save read information for bar plot!
        bar_plot_dicts.append({"gene_name": target_gene,
                               "lib": lib,
                               "unadapted_rpm": unadapted_rpm,
                               "adapted_rpm": adapted_rpm})

    sea.set_style("whitegrid")
    # Stacked bars per library: unadapted RPM in black, adapted RPM in red.
    bar_plot_df = pd.DataFrame(bar_plot_dicts).set_index('lib')[['unadapted_rpm', 'adapted_rpm']]
    bar_ax = bar_plot_df.plot(kind='bar', stacked=True, color=['k', 'red'], rot=45)
    bar_ax.set_title(f"RPMs on {target_gene}")
    bar_fig = bar_ax.get_figure()
    # Rotate the labels (rot=45 above) and fit the layout once, BEFORE saving, so the
    # .png and .svg are rendered from exactly the same layout.
    bar_fig.tight_layout()
    bar_plot_save_path = f"{target_gene_folder_path}/barPlot_geneRPMS_{target_gene}"
    for file_type in figure_file_types:
        bar_fig.savefig(bar_plot_save_path + file_type,
                        dpi=300)