# comparingTailLengths.ipynb
## Marcus Viscardi,    January 16, 2023

The general goal is just to look at the tail length distributions of a couple of genes.

I think *odc-1* is of particular interest b/c it has only one isoform, but that one isoform looks NMD sensitive! This would mean that all the reads mapping are likely relevant to NMD. The counter-example would be a gene like *ubl-1* in which ~most of the reads are not related to NMD, and I have no good way to parse out which are which.

**update 1/17/23:** So we seem to have a problem with the really low number of adapted reads for basically all genes, even in N2 libraries. The two pilot libs seem to have way better depth and adapted species coverage... For example, in my newN2 lib, *odc-1* only has **4** adapted reads!

In [None]:
import os
import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

import pandas as pd
import seaborn as sea
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = "browser"

print(f"Imports done at {npCommon.get_dt(for_print=True)}")

In [None]:
# Load the newest saved per-read and per-gene ("compressed") dataframes, then
# swap the long library identifiers for the short labels used in later cells.
# A ValueError anywhere in this step falls back to None for both frames.
try:
    newest_reads_parquet = npCommon.find_newest_matching_file(
        "./output_files/*_quad5TERA.reads_df.parquet"
    )
    reads_df_genes = pd.read_parquet(newest_reads_parquet)

    newest_compressed_parquet = npCommon.find_newest_matching_file(
        "./output_files/*_quad5TERA.compressed_df.parquet"
    )
    compressed_df_genes = pd.read_parquet(newest_compressed_parquet)

    # long library identifier -> short label
    conversion_dict = dict([
        ("xrn-1-5tera", "oldN2"),
        ("xrn-1-5tera-smg-6", "oldS6"),
        ("5tera_xrn-1-KD_wt", "newN2"),
        ("5tera_xrn-1-KD_smg-5", "newS5"),
        ("5tera_xrn-1-KD_smg-6", "newS6"),
        ("5tera_xrn-1-KD_smg-7", "newS7"),
    ])
    reads_df_genes = reads_df_genes.replace(conversion_dict)
    compressed_df_genes = compressed_df_genes.replace(conversion_dict)
    print(f"Finished library loading at {npCommon.get_dt(for_print=True)}")
except ValueError:
    reads_df_genes = None
    compressed_df_genes = None
    print("Could not find pre-compressed dataframes saved. Try running the first few cells of initialTestingAndScratchPaper.ipynb")

In [None]:
# Per-library summary rows for ets-4, trimmed to the tail-length columns of interest.
ets4_summary_columns = ["lib", 'gene_id', 'gene_name', 't5', 'gene_hits', 'mean_polya_length', 'median_polya_length']
compressed_df_genes.query("gene_name == 'ets-4'").loc[:, ets4_summary_columns]

In [None]:
# Interactive eCDFs of poly(A) tail length for one gene, one figure per library,
# with the curves split by the t5 column.
target_gene = 'F19B2.5'

for lib in ('oldN2', 'newN2', 'newS5'):
    # Keep only the reads for this gene in this library.
    plot_df = reads_df_genes.query("gene_name == @target_gene and lib == @lib")

    fig = px.ecdf(
        plot_df,
        x='polya_length',
        color='t5',
        ecdfnorm='percent',
        marginal="rug",
    )
    fig.update_layout(title=f"{target_gene} for {lib}")
    fig.show()

In [None]:
# Static eCDF panels of poly(A) tail length: one figure per library, one panel per gene,
# with curves split by the t5 column. Each figure is saved as .svg and .png.
#
# Swap entries in/out of these lists to change which genes/libraries are drawn.
# target_genes = ['odc-1', 'F19B2.5', 'rpl-3', 'rpl-7A', 'rpl-30', 'ubl-1', 'nduo-5']
# target_genes = ['rpl-7A', 'rpl-12', 'rpl-1', 'rpl-3']
target_genes = ['rpl-26', 'odc-1', 'rpl-3', 'rpl-30', 'ets-4']
target_libs = [
    'oldN2',
    'newN2',
    #'newS5',
    #'newS6',
    #'oldS6',
]

# t5 value -> curve colour. The key order doubles as the hue order and the legend order,
# so a legend entry can never be attached to the wrong curve.
t5_palette = {'-': 'black', '+': 'red'}
t5_legend_names = {'-': 'No', '+': 'Yes'}
ecdf_linewidth = 3
tail_length_plot_dir = "./output_files/tail_lengths"


def tail_count_label(gene_reads_df, t5_value):
    """Build the legend text for one t5 state of one gene.

    Args:
        gene_reads_df: reads dataframe already filtered to a single gene/library;
            must have 't5', 'polya_length' and 'lib' columns.
        t5_value: '-' or '+', the t5 state to summarise.

    Returns:
        A string like "No (n=12/15)": the first number is the count of non-null
        polya_length values, the second the count of non-null lib values.
    """
    counts = gene_reads_df[gene_reads_df.t5 == t5_value].count()
    return f"{t5_legend_names[t5_value]} (n={counts['polya_length']}/{counts['lib']})"


# The theme only needs setting once, not once per library.
sea.set()
sea.set_style("whitegrid")

# savefig does not create missing directories.
os.makedirs(tail_length_plot_dir, exist_ok=True)

for lib in target_libs:
    plot_df = reads_df_genes.query(f"lib == '{lib}'")

    # squeeze=False always gives a 2D array of axes, so a single-gene list still indexes fine.
    fig, axes_grid = plt.subplots(1, len(target_genes), figsize=(2.5*len(target_genes), 4),
                                  sharey='all',
                                  #sharex='all',
                                  squeeze=False,
                                  )
    axes = axes_grid[0]
    fig.suptitle(f"eCDF plots of tail length from {lib} library")

    for i, target_gene in enumerate(target_genes):
        subplot_df = plot_df.query(f"gene_name == '{target_gene}'")
        sea.ecdfplot(ax=axes[i],
                     data=subplot_df,
                     x='polya_length',
                     hue='t5',
                     hue_order=list(t5_palette),
                     palette=t5_palette,
                     linewidth=ecdf_linewidth,
                     legend=False)  # the legend is built explicitly below
        axes[i].set_title(f"{target_gene} in {lib}")

        # Passing only labels= to ax.legend assigns them positionally to whatever lines were
        # drawn, which depends on seaborn's draw order and on which t5 values are present.
        # Explicit proxy handles pin each label (and its counts) to the matching colour.
        legend_handles = [Line2D([], [], color=t5_palette[t5_value], linewidth=ecdf_linewidth)
                          for t5_value in t5_palette]
        legend_labels = [tail_count_label(subplot_df, t5_value) for t5_value in t5_palette]
        axes[i].legend(handles=legend_handles,
                       labels=legend_labels,
                       title="5TERA Adapted",
                       loc="lower right")

    fig.tight_layout()
    save_path = f"{tail_length_plot_dir}/{npCommon.get_dt()}_tailLengthCDFs_{lib}"
    for file_type in ['.svg', '.png']:
        fig.savefig(save_path + file_type,
                    dpi=300)
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

In [None]:
# See sticky note above my bench...
#
# Layout sketch: one tall 'left' panel spanning every row, two stacks of three
# middle panels (upper/lower x top/middle/bottom), and a tall 'right' panel per stack.
gs_kw = {
    "width_ratios": [1, 3, 2],
    "height_ratios": [1, 1, 2] * 2,  # same row pattern for the upper and lower stacks
}

mosaic_layout = [
    ['left', f'{half} {level}', f'{half} right']
    for half in ('upper', 'lower')
    for level in ('top', 'middle', 'bottom')
]

# constrained_layout=True and plt.tight_layout() are left switched off here.
fig, axdict = plt.subplot_mosaic(
    mosaic_layout,
    gridspec_kw=gs_kw,
    figsize=(5, 7),
)
axdict['left']

In [None]:
# Quick check: is the object returned by subplot_mosaic itself a single Axes?
axdict_is_single_axes = isinstance(axdict, plt.Axes)
axdict_is_single_axes

In [None]:
# ets-4 reads inside plot_df. plot_df is reassigned inside the plotting loops in earlier
# cells, so this shows the subset left behind by the most recently executed loop iteration.
plot_df_gene_to_inspect = 'ets-4'
plot_df.query("gene_name == @plot_df_gene_to_inspect")

In [None]:
# All ets-4 reads across every library in the full per-read dataframe.
reads_df_gene_to_inspect = 'ets-4'
reads_df_genes.query("gene_name == @reads_df_gene_to_inspect")