In [None]:
import sys
import warnings
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

from tqdm.notebook import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import dash_bio
pio.renderers.default = "browser"

import numpy as np
import pandas as pd 
import statistics as stats
# Let pandas print every column and use a wider console line so wide
# frames are not elided when displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

# Forward mapping comes from the shared pipeline module; the reverse mapping
# swaps keys and values (if values repeat, the last key seen wins).
CONVERSION_DICT = npCommon.CONVERSION_DICT
REV_CONVERSION_DICT = dict(zip(CONVERSION_DICT.values(), CONVERSION_DICT.keys()))

# Timestamped marker so it is obvious in the output when this cell last ran.
print(f"Imports done at {npCommon.get_dt(for_print=True)}")

In [None]:
# Location of the parquet table read below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists.
output_df_path = "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python/231010_fishers/231010_statTests_fishers_largeDF.parquet"

# Keep the freshly loaded frame under a "_raw" name; later cells work on copies.
output_df_raw = pd.read_parquet(path=output_df_path)

# Preview the first five rows as the cell output.
output_df_raw.head(n=5)

In [None]:
# Recover the (lib1, lib2) pairs that were compared by parsing column names of
# the form "<lib1>_v_<lib2>_fishers_significant".
lib_comps_run = []
for col in output_df_raw.columns:
    if not col.endswith("fishers_significant"):
        continue
    name_parts = col.split("_v_")
    lib1 = name_parts[0]
    lib2 = name_parts[1].split("_fishers_significant")[0]
    print(lib1, lib2)
    lib_comps_run.append((lib1, lib2))

# Every library name that appears on either side of at least one comparison.
libs_run = {lib for pair in lib_comps_run for lib in pair}

In [None]:
# Minimum value required in every `total_gene_hits_<lib>` column for a row to be kept.
min_cutoff_hits = 10

# Work on a copy so the loaded frame stays untouched and this cell can be re-run.
output_df = output_df_raw.copy()

# One boolean Series per library: does the row reach the hit cutoff in that library?
min_cutoff_masks = [output_df[f'total_gene_hits_{lib}'] >= min_cutoff_hits for lib in libs_run]
# One Series per comparison holding its Fisher's-test significance flag
# (assumed boolean/0-1 so that summing counts the significant comparisons).
sig_cutoff_masks = [output_df[f'{lib1}_v_{lib2}_fishers_significant'] for lib1, lib2 in lib_comps_run]

# A row passes only if it reaches the cutoff in *all* libraries.
passed_cutoff_col = f'passed_{min_cutoff_hits}_hit_cutoff_for_all'
output_df[passed_cutoff_col] = pd.concat(min_cutoff_masks, axis=1).all(axis=1)
# Number of comparisons in which the row was flagged significant.
output_df['num_sig_hits'] = pd.concat(sig_cutoff_masks, axis=1).sum(axis=1)

# Keep passing rows. The explicit .copy() makes the result an independent
# frame, so adding columns to it in later cells does not raise
# SettingWithCopyWarning.
output_df = output_df[output_df[passed_cutoff_col]].copy()

# Cell output: rows ordered by how many comparisons were significant.
output_df.sort_values("num_sig_hits", ascending=False)

In [None]:
# Compute log2(RPM_lib2 / RPM_lib1) for every comparison that was run.
# The columns are collected first and attached in one step with `assign`,
# which returns a new frame instead of writing into `output_df` in place
# (in-place writes on a filtered frame trigger SettingWithCopyWarning).
log2fc_columns = {}
for lib1, lib2 in lib_comps_run:
    new_col_name = f'log2FC_RPM_{lib2}_v_{lib1}'
    rpm_ratio = output_df[f'total_gene_rpm_{lib2}'] / output_df[f'total_gene_rpm_{lib1}']
    # NOTE(review): a zero RPM in either library yields +/-inf or NaN here;
    # this relies on the earlier hit-count filter having removed such rows.
    log2fc_columns[new_col_name] = np.log2(rpm_ratio)
    print(new_col_name)
output_df = output_df.assign(**log2fc_columns)

# Plotting frame: only the fold-change columns plus the significance count.
plot_columns = [col for col in output_df.columns if "log2FC_RPM" in col or col == "num_sig_hits"]
# Move the index levels into columns and discard the two identifier columns.
# Chained calls build a new frame rather than mutating a slice with inplace=True.
plot_df = (
    output_df[plot_columns]
    .reset_index()
    .drop(columns=["chr_id", 'gene_id'])
)

In [None]:
# Minimum number of significant comparisons a row needs to be plotted
# (0 keeps every row).
sig_cutoff_num = 0

# Apply the cutoff, label rows by gene name, and drop the count column so only
# the log2 fold-change values are left for clustering.
filtered_plot_df = (
    plot_df
    .query(f"num_sig_hits >= {sig_cutoff_num}")
    .set_index("gene_name")
    .drop(columns=["num_sig_hits"])
)

sea.set_theme(style="whitegrid")

# Diverging palette centred on 0 so the sign of the fold change reads directly
# from the colour.
clustermap_options = dict(center=0, cmap="vlag", figsize=(5, 10))
sea.clustermap(filtered_plot_df, **clustermap_options)
plt.show()

In [None]:
# Interactive clustergram of the same filtered frame; rows and columns are
# labelled with the frame's own index and column names.
clustergram_col_labels = filtered_plot_df.columns.tolist()
clustergram_row_labels = filtered_plot_df.index.tolist()
clustergram_fig = dash_bio.Clustergram(
    data=filtered_plot_df,
    column_labels=clustergram_col_labels,
    row_labels=clustergram_row_labels,
    height=1000,
    width=700,
)
# The last expression is the cell output, so the figure is displayed.
clustergram_fig

In [None]:
# Display the (lib1, lib2) comparison pairs parsed from the column names.
lib_comps_run

In [None]:
# Scatter of two log2 fold-change columns against each other.
# NOTE(review): the column names are hard-coded and must match the columns
# built from the comparison pairs earlier in the notebook.
scatter_x_col = "log2FC_RPM_newerS6_v_newerN2"
scatter_y_col = "log2FC_RPM_newerS5_v_newerN2"
px.scatter(filtered_plot_df, x=scatter_x_col, y=scatter_y_col)