In [None]:
import sys
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

from tqdm.notebook import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

import numpy as np
import pandas as pd
import statistics as stats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

# Full library name -> short label used throughout the plots below.
CONVERSION_DICT = {
    # original runs
    "xrn-1-5tera": "oldN2",
    "xrn-1-5tera-smg-6": "oldS6",
    # xrn-1 knock-down runs
    "5tera_xrn-1-KD_wt": "newN2",
    "5tera_xrn-1-KD_smg-5": "newS5",
    "5tera_xrn-1-KD_smg-6": "newS6",
    "5tera_xrn-1-KD_smg-7": "newS7",
    # re-runs of the knock-down libraries
    "5tera_xrn-1-KD_wt_rerun": "newerN2",
    "5tera_xrn-1-KD_smg-6_rerun": "newerS6",
    "5tera_xrn-1-KD_smg-5_rerun": "newerS5",
    # these two keep their own name as the label
    "sPM57": "sPM57",
    "sPM58": "sPM58",
}
# Inverse lookup: short label -> full library name.
REV_CONVERSION_DICT = dict(zip(CONVERSION_DICT.values(), CONVERSION_DICT.keys()))

# Log a timestamp once the import/configuration cell has finished running.
print(f"Imports done at {npCommon.get_dt(for_print=True)}")

In [None]:
# Set to True to skip the cached parquet files and rebuild them from the individual libraries.
regenerate = False
# Short labels of the libraries to load; sorted so the cache file name does not depend on listing order.
libs_to_load = sorted({
    'oldN2',
    'newN2',
    'newerN2',
    'oldS6',
    'newS6',
    'newerS6',
    'newS5',
    'newerS5',
    # 'newS7',
})

merge_dir = "./output_files/mega_merge_parquets"
libs_tag = "-".join(libs_to_load)

try:
    if regenerate:
        # Jump straight to the rebuild branch below.
        raise ValueError

    # NOTE(review): presumably find_newest_matching_file raises ValueError when nothing matches — confirm.
    reads_df_raw_path = npCommon.find_newest_matching_file(
        f"{merge_dir}/*_{libs_tag}_merged5TERA.reads_df.parquet"
    )
    compressed_df_genes_raw_path = npCommon.find_newest_matching_file(
        f"{merge_dir}/*_{libs_tag}_merged5TERA.compressed_df.parquet"
    )
    print(f"Found preprocessed files at:\n\t{reads_df_raw_path}\nand:\n\t{compressed_df_genes_raw_path}")

    reads_df_genes_raw = pd.read_parquet(reads_df_raw_path)
    compressed_df_genes_raw = pd.read_parquet(compressed_df_genes_raw_path)
except ValueError:
    # Reached when regenerate is True or when anything in the try block raised ValueError.
    print(f"Could not find preprocessed files matching these libs: {'/'.join(libs_to_load)}\nGoing to create new ones from scratch! This will take longer.")
    full_lib_names = list(map(REV_CONVERSION_DICT.__getitem__, libs_to_load))
    reads_df_genes_raw, compressed_df_genes_raw = npCommon.load_and_merge_lib_parquets(
        full_lib_names,
        drop_sub_n=1,
        add_tail_groupings=False,
        drop_failed_polya=False,
        group_by_t5=True,
        use_josh_assignment=False,
    )
    print("Saving new parquets to speed up future runs.")
    reads_df_genes_raw.to_parquet(
        f"{merge_dir}/{npCommon.get_dt()}_{libs_tag}_merged5TERA.reads_df.parquet"
    )
    compressed_df_genes_raw.to_parquet(
        f"{merge_dir}/{npCommon.get_dt()}_{libs_tag}_merged5TERA.compressed_df.parquet"
    )
print(f"Lib load done @ {npCommon.get_dt(for_print=True)}")

In [None]:
# Work on separate frames so the raw frames loaded above stay untouched and this cell can be re-run.
compressed_df = compressed_df_genes_raw.copy()
# DataFrame.replace already returns a new frame, so an extra .copy() of the (large) reads table
# beforehand is unnecessary. Only the 'lib' column is relabelled to the short names.
reads_df = reads_df_genes_raw.replace({'lib': CONVERSION_DICT})
print(f"Dataframe copies done @ {npCommon.get_dt(for_print=True)}")

In [None]:
# Pivot the per-gene table to wide form: one gene_hits / gene_rpm column pair per (library, t5) group.
conversion_dict = CONVERSION_DICT
id_cols = ["chr_id", "gene_id", "gene_name"]
value_cols = ["gene_hits", "gene_rpm"]

ans = [group_df for _, group_df in compressed_df.groupby(['lib', 't5'], as_index=False)]
df_dict = {}
for df in ans:
    # Every row of a group shares the same lib / t5 value, so take the first unique entry.
    lib = df.lib.unique()[0]
    t5 = df.t5.unique()[0]
    short_lib = conversion_dict[lib]
    column_suffix = f"_{short_lib}_t5{t5}"
    df = df[id_cols + value_cols]
    df = df.rename(columns={col: f"{col}{column_suffix}" for col in value_cols})
    df_dict[(short_lib, t5)] = df.set_index(id_cols)

# Outer join on the gene identifiers; genes missing from a group get 0.
wide_compressed_df = pd.concat(list(df_dict.values()), axis=1, join='outer').fillna(0)
wide_compressed_df

In [None]:
# Display the per-read table for a quick visual check before plotting.
reads_df

In [None]:
# Libraries to show, in left-to-right plotting order (commented-out entries are skipped).
lib_order = [
    "oldN2",
    # "newN2",
    "newerN2",
    "oldS6",
    # "newS6",
    "newerS6",
    # "newS5",
    "newerS5",
]
lib_rank = {lib_name: rank for rank, lib_name in enumerate(lib_order)}
# Filter and sort in one chain: sorting the result of .query() in place can trigger
# SettingWithCopyWarning and makes the cell harder to re-run safely.
plot_df = (
    reads_df
    .query("lib in @lib_order")
    .sort_values(by='lib', key=lambda col: col.map(lib_rank))
)

# Box plot of poly(A) tail length per library, split by t5:
fig, ax = plt.subplots(figsize=(10, 10))
sea.boxplot(data=plot_df, x="lib", y="polya_length",
            hue="t5", ax=ax)
ax.set_ylim(0, 250)
plt.savefig(f"./output_files/global_tails/{npCommon.get_dt()}_libsAndT5_boxPlot.png")
plt.show()

# Split violin plot of the same data (displayed only, not saved):
fig, ax = plt.subplots(figsize=(10, 10))
sea.violinplot(data=plot_df, x="lib", y="polya_length",
               hue="t5", ax=ax,
               split=True, inner="quartile", cut=0)
ax.set_ylim(0, 250)
plt.show()

# Let's print the number of observations for each category:
for t5 in plot_df.t5.unique():
    for lib in plot_df.lib.unique():
        # Reference the loop variables with @ instead of pasting them into the query string.
        category_count = plot_df.query("t5 == @t5 & lib == @lib").shape[0]
        print(f"{lib:>7}; t5{t5} has {category_count:>8,} observations")


In [None]:
def plot_violins_and_ecdf(plot_df, title_str, save_prefix=None, ymax=250):
    """Plot split violins and per-library ECDFs of poly(A) tail length.

    Prints the number of reads per library (and the share with ``t5 == "+"``), then draws
    a split violin plot (one violin per library, halves split by ``t5``) followed by a
    FacetGrid with one ECDF panel per library.

    Args:
        plot_df: DataFrame with at least ``lib``, ``t5`` and ``polya_length`` columns.
        title_str: Title for the plots; only applied when it is a ``str``.
        save_prefix: When a ``str``, both figures are saved under
            ``./output_files/global_tails/`` using this prefix. Nothing is saved otherwise.
        ymax: Upper limit of the tail-length axis (y axis of the violins, x axis of the
            ECDFs). Axis limits are left automatic when this is not a number.

    Returns:
        None. Figures are shown (and optionally saved) as a side effect.
    """
    print(f"Plotting {title_str}...")
    if plot_df.empty:
        # Nothing to draw; also avoids dividing by zero libraries when sizing the facets.
        print("\tNo observations to plot, skipping.")
        return

    libs_in_plot = plot_df.lib.unique()
    for lib_name in libs_in_plot:
        lib_reads = plot_df.query('lib == @lib_name')
        num_obs = lib_reads.shape[0]
        num_adapt = lib_reads.query('t5 == "+"').shape[0]
        print(f"\t{lib_name} has {num_obs} observations ({num_adapt/num_obs*100:.2f}% of which are adapted)")

    has_axis_limit = isinstance(ymax, (int, float))
    has_title = isinstance(title_str, str)
    should_save = isinstance(save_prefix, str)

    # Make Violin Plot
    fig, ax = plt.subplots(figsize=(8, 4))
    sea.violinplot(data=plot_df, x="lib", y="polya_length", hue="t5", ax=ax, split=True, inner="quartile", cut=0)
    if has_axis_limit:
        ax.set_ylim(0, ymax)
    if has_title:
        ax.set_title(title_str)
    plt.tight_layout()
    if should_save:
        plt.savefig(f"./output_files/global_tails/{npCommon.get_dt()}_{save_prefix}_violinPlot.png", dpi=300)
    plt.show()

    # Make ECDF Facet Plots
    # Size the facets from the frame actually being plotted (not from a notebook-level global),
    # so the overall figure width stays constant regardless of how many libraries are shown.
    grid = sea.FacetGrid(plot_df, col="lib", height=4, aspect=2 / len(libs_in_plot))
    grid.map_dataframe(sea.ecdfplot, x="polya_length", hue="t5")
    grid.set_axis_labels("Poly(A) tail length", "Cumulative distribution")
    grid.add_legend()
    if has_axis_limit:
        # Tail length is on the x axis here, so ymax bounds x; the ECDF itself spans 0..1.
        grid.set(xlim=(0, ymax), ylim=(0, 1))
    if has_title:
        # Keep the library name in each facet title; a bare title would label every panel identically.
        # Braces in title_str are escaped because set_titles treats its argument as a format template.
        safe_title = title_str.replace("{", "{{").replace("}", "}}")
        grid.set_titles(col_template=f"{safe_title} | {{col_name}}")
    plt.tight_layout()
    if should_save:
        plt.savefig(f"./output_files/global_tails/{npCommon.get_dt()}_{save_prefix}_ecdfPlot.png", dpi=300)
    plt.show()

In [None]:
# Now let's make these plots with the individual genes of interest from the Fisher's tests.
# First let's load those:
path_to_fishers = "/data16/marcus/scripts/nanoporePipelineScripts/fourLibrary5TERA/output_files/230613_outputs/230613_merged_fishers.csv"
fishers_df = pd.read_csv(path_to_fishers, index_col=0)
# Keep only genes flagged significant in both the newerN2-vs-newerS5 and newerN2-vs-newerS6 columns.
fishers_short_df = fishers_df.query("newerN2_v_newerS5_fishers_significant & newerN2_v_newerS6_fishers_significant")
filtered_reads_df = reads_df.query("gene_name in @fishers_short_df.gene_name")

# Violin with Seaborn with facets for each gene, comparing the + or - for t5:
sea.set_style("whitegrid")
lib_order = [
    "oldN2",
    # "newN2",
    "newerN2",
    "oldS6",
    # "newS6",
    "newerS6",
    # "newS5",
    "newerS5",
]
lib_rank = {lib_name: rank for rank, lib_name in enumerate(lib_order)}


def _sort_by_lib_order(lib_column):
    """Sort key: position of each library in lib_order."""
    return lib_column.map(lib_rank)


# Filter and sort in one chain instead of sorting a .query() result in place
# (which can trigger SettingWithCopyWarning).
interest_plot_df = (
    filtered_reads_df
    .query("lib in @lib_order")
    .sort_values(by='lib', key=_sort_by_lib_order)
)

# Per-gene plots are off by default. Previously this loop still ran one query per gene and
# threw the result away because the plotting call was commented out.
PLOT_INDIVIDUAL_GENES = False
if PLOT_INDIVIDUAL_GENES:
    for gene_name in interest_plot_df.gene_name.unique():
        per_gene_plot_df = interest_plot_df.query("gene_name == @gene_name")
        plot_violins_and_ecdf(per_gene_plot_df, f'Gene-{gene_name}', save_prefix=f'Gene-{gene_name}', ymax=250)

# Aggregate of genes of interest:
plot_violins_and_ecdf(interest_plot_df, "interestGenes", "interestGenes", ymax=250)

# Now for the global:
global_plot_df = (
    reads_df
    .query("lib in @lib_order")
    .sort_values(by='lib', key=_sort_by_lib_order)
)
plot_violins_and_ecdf(global_plot_df, "allGenes", "allGenes", ymax=250)

# Let's try and do this a little cleaner to maybe just make a publishable figure out of the gate:

What I really want to show for this is just newerN2 vs. newerS6 with the t5 split, I think.

In [None]:
plot_libs = [
    # "oldN2",
    "newerN2",
    # "oldS6",
    "newerS6",
    "newerS5",
]
# Fixed hue order so the left half of every violin is t5 '+' and the right half is t5 '-',
# matching the "Cleaved: / Full-Length:" annotation drawn on each panel below.
t5_order = ["+", "-"]
sort_ranks = {
    'lib': {lib_name: rank for rank, lib_name in enumerate(plot_libs)},
    't5': {t5_level: rank for rank, t5_level in enumerate(t5_order)},
}


def _rank_sort_key(column):
    """Sort key applied per column: map values to their rank for that specific column.

    Mapping every sort column through the library ranks would turn the t5 column into
    all-NaN and make the t5 sort a no-op, so the lookup is chosen by column name.
    """
    return column.map(sort_ranks[column.name])


sea.set_style("whitegrid")
# squeeze=False keeps `axes` two-dimensional even when only one library is listed.
fig, axes = plt.subplots(2, len(plot_libs), figsize=((7/3)*len(plot_libs), 7), dpi=300, squeeze=False)
# Top row shows all genes, bottom row the genes of interest. Frames are filtered and sorted in
# one chain rather than sorting a .query() result in place.
all_genes_read_df = (
    reads_df
    .query("lib in @plot_libs")
    .sort_values(by='lib', key=_rank_sort_key)
)
interest_genes_read_df = (
    filtered_reads_df
    .query("lib in @plot_libs & gene_name in @fishers_short_df.gene_name")
    .sort_values(by=['lib', 't5'], key=_rank_sort_key, ascending=[True, True])
)

# NOTE: `sub_plot_df` and `lib` stay bound after this loop and later cells read them.
for row_i, (all_or_interest, sub_plot_df) in enumerate([('All', all_genes_read_df),
                                                        ('Interest', interest_genes_read_df)]):
    for col_i, lib in enumerate(plot_libs):
        panel_ax = axes[row_i][col_i]
        lib_reads_df = sub_plot_df.query("lib == @lib")
        sea.violinplot(data=lib_reads_df,
                       y="polya_length",
                       x="lib",
                       hue="t5",
                       hue_order=t5_order,
                       ax=panel_ax,
                       split=True,
                       inner="quartile",
                       bw=0.25,
                       cut=0,
                       )
        panel_ax.set_ylim(-5, 250)
        panel_ax.set_title(f"{all_or_interest} genes\n({lib})")
        panel_ax.set_xlabel("")
        panel_ax.set_xticklabels([])
        # Only the left-most column carries the y-axis label.
        panel_ax.set_ylabel("Poly(A) Tail Length (nt)" if col_i == 0 else "")

        # Annotate the number of reads (rows) in each t5 category:
        sub_plot_adapted_df = lib_reads_df.query("t5 == '+'")
        plus_reads = len(sub_plot_adapted_df)
        sub_plot_unadapted_df = lib_reads_df.query("t5 == '-'")
        minus_reads = len(sub_plot_unadapted_df)

        print(f"{lib} ({all_or_interest}), t5+: Median {sub_plot_adapted_df.polya_length.median():.2f}, Mean {sub_plot_adapted_df.polya_length.mean():.2f}, t5-: Median {sub_plot_unadapted_df.polya_length.median():.2f}, Mean {sub_plot_unadapted_df.polya_length.mean():.2f}")
        panel_ax.text(0.5, 0.9,
                      f"{'Cleaved:':>13}      {'Full-Length:':<13}\n"
                      f"{plus_reads:>13,}      {minus_reads:<13,}",
                      size='small',
                      horizontalalignment='center',
                      verticalalignment='center',
                      transform=panel_ax.transAxes,
                      )
        # Keep a single legend, on the bottom-right panel.
        if col_i == len(plot_libs)-1 and row_i == 1:
            panel_ax.legend(loc='center right')
        else:
            panel_legend = panel_ax.get_legend()
            if panel_legend is not None:  # no legend is created when a panel has nothing to draw
                panel_legend.remove()

plt.tight_layout()
plt.savefig(f"./output_files/global_tails/{npCommon.get_dt()}_allGenes-v-interestGenes_violinPlot.png", dpi=300)
plt.savefig(f"/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python/230711_coverageAndTails_plots/AllGenes/{npCommon.get_dt()}_allGenes-v-interestGenes_violinPlot.svg")
plt.show()

In [None]:
# Same 2 x len(plot_libs) layout as the violin figure, but drawing ECDFs instead:
# top row = all genes, bottom row = genes of interest, one column per library.
sea.set_style("whitegrid")
fig, axes = plt.subplots(2, len(plot_libs), figsize=((7/3)*len(plot_libs), 7), dpi=300)
last_col_i = len(plot_libs) - 1
# NOTE: `sub_plot_df` and `lib` stay bound after this loop and later cells read them.
for row_i, sub_plot_df in enumerate([all_genes_read_df, interest_genes_read_df]):
    is_top_row = row_i == 0
    for col_i, lib in enumerate(plot_libs):
        panel_ax = axes[row_i][col_i]
        sea.ecdfplot(data=sub_plot_df.query("lib == @lib"),
                     x="polya_length",
                     hue="t5",
                     ax=panel_ax,
                     )
        panel_ax.set_ylim(-0.05, 1.05)
        panel_ax.set_xlim(-5, 250)
        # Titles on every panel; the x label only on the bottom row.
        panel_ax.set_title(f"All genes\n({lib})" if is_top_row else f"Interest genes\n({lib})")
        panel_ax.set_xlabel("" if is_top_row else "Poly(A) Tail Length (nt)")
        # The y label only on the left-most column.
        panel_ax.set_ylabel("Proportion of Reads" if col_i == 0 else "")
        # Only the bottom-right panel keeps its (default) legend.
        keeps_legend = col_i == last_col_i and row_i == 1
        if not keeps_legend:
            panel_ax.get_legend().remove()
plt.tight_layout()
plt.savefig(f"./output_files/global_tails/{npCommon.get_dt()}_allGenes-v-interestGenes_ecdfPlot.png", dpi=300)
plt.show()

In [None]:
from pathlib import Path

def plots_tails_ecdf(library_df,
                     lib_target,
                     save_dir="./output_files/isoform_plots/NMD_and_Adapted_tailPlots",
                     given_ax=None,
                     log_xaxis=True,
                     gene_set_label="AllGenes",
                     ):
    """Draw an ECDF of poly(A) tail length, split by ``t5``, for one library.

    Args:
        library_df: DataFrame with ``polya_length`` and ``t5`` columns.
        lib_target: Library label; only used to build the output file name.
        save_dir: Directory (``str`` or ``Path``) to save ``.svg`` and ``.png`` copies into.
            It is created, including missing parents, when absent. Any other value
            (e.g. ``None``) disables saving.
        given_ax: Existing matplotlib Axes to draw on. When ``None`` a new 8x6 figure is
            created, laid out with ``tight_layout`` and shown at the end.
        log_xaxis: Use a log-scaled x axis bounded to 10-200 with fixed ticks; otherwise a
            linear axis bounded to -1..200.
        gene_set_label: Label embedded in the output file name (defaults to the
            previously hard-coded ``"AllGenes"``).

    Returns:
        None.
    """
    owns_figure = given_ax is None
    if owns_figure:
        _, ax = plt.subplots(figsize=(8, 6))
    else:
        ax = given_ax

    sea.ecdfplot(library_df,
                 ax=ax,
                 x='polya_length',
                 hue='t5',
                 log_scale=log_xaxis,
                 )
    if log_xaxis:
        ax.set_xbound(10, 200)
        ax.set_xticks([10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200])
        # None leaves the crowded ticks (60, 80, 90) unlabelled.
        ax.set_xticklabels([10, 20, 30, 40, 50, None, 70, None, None, 100, 200])
    else:
        ax.set_xbound(-1, 200)

    legend = ax.get_legend()
    if legend is not None:  # no legend exists when there was nothing to draw
        legend.remove()
    if owns_figure:
        plt.tight_layout()

    if isinstance(save_dir, (str, Path)):
        save_dir = Path(save_dir)
        if not save_dir.exists():
            print(f"Making new directory at: {save_dir}")
            # The default save_dir is nested, so missing parents must be created too.
            save_dir.mkdir(parents=True, exist_ok=True)

        save_name = f"{npCommon.get_dt()}_{gene_set_label}_{lib_target}_tailLength.ecdf"
        if log_xaxis:
            save_name += ".logAxis"
        for file_type in ('.svg', '.png'):
            # Save the figure that owns `ax`; with a caller-supplied Axes the pyplot
            # "current figure" is not guaranteed to be the same one.
            ax.figure.savefig(str(save_dir / (save_name + file_type)), dpi=300)
    if owns_figure:
        plt.show()

sea.set_style("whitegrid")
# sea.set_context('talk')

# Plot from `all_genes_read_df` explicitly. This loop previously read `sub_plot_df`, a loop
# variable left over from the figure cells above; after those loops it refers to the
# interest-genes frame, while the files written here are named and filed as "AllGenes".
for lib in ['newerN2', 'newerS6', 'newerS5']:
    plots_tails_ecdf(all_genes_read_df.query("lib == @lib"),
                     lib,
                     save_dir="/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python/230711_coverageAndTails_plots/AllGenes")

In [None]:
# Quick split-violin check for newerN2 only.
# NOTE(review): `sub_plot_df` is not defined in this cell. It is the loop variable left behind by
# the figure loops above, so with the cells run in order it holds the interest-genes frame.
newerN2_reads_df = sub_plot_df.query("lib == 'newerN2'")
sea.violinplot(data=newerN2_reads_df,
               x='lib',
               y='polya_length',
               hue='t5',
               split=True,
               inner='quartile',
               orient='v',
               )