In [None]:
import sys
import warnings
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

from tqdm.notebook import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

import numpy as np
import pandas as pd
import statistics as stats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

# Maps each library's full name to the short label used in column names and plots.
CONVERSION_DICT = {
    "xrn-1-5tera": "oldN2",
    "xrn-1-5tera-smg-6": "oldS6",
    "5tera_xrn-1-KD_wt": "newN2",
    "5tera_xrn-1-KD_smg-5": "newS5",
    "5tera_xrn-1-KD_smg-6": "newS6",
    "5tera_xrn-1-KD_smg-7": "newS7",
    "5tera_xrn-1-KD_wt_rerun": "newerN2",
    "5tera_xrn-1-KD_smg-6_rerun": "newerS6",
    "5tera_xrn-1-KD_smg-5_rerun": "newerS5",
    "sPM57": "sPM57",
    "sPM58": "sPM58",
}
# Inverse lookup (short label -> full library name); the short labels above are all distinct.
REV_CONVERSION_DICT = dict(zip(CONVERSION_DICT.values(), CONVERSION_DICT.keys()))

print("Imports done at {}".format(npCommon.get_dt(for_print=True)))

# Load pre-processed files:

In [None]:
# Set to True to skip any cached parquets and rebuild the merged frames from the raw libraries.
regenerate = False
# Short labels of the libraries to merge; sorted so the cache file name is stable for a given selection.
libs_to_load = sorted({
    'oldN2',
    'newN2',
    'newerN2',
    'oldS6',
    # 'newS6',
    'newerS6',
    # 'newS5',
    'newerS5',
    'newS7',
})
mega_merge_dir = "./output_files/mega_merge_parquets"
merged_libs_tag = '-'.join(libs_to_load)

try:
    if regenerate:
        # Deliberately jump to the rebuild branch below.
        raise FileNotFoundError

    # Look up the newest cached file of each kind for exactly this library selection.
    reads_df_raw_path, compressed_df_genes_raw_path = (
        npCommon.find_newest_matching_file(f"{mega_merge_dir}/*_{merged_libs_tag}_merged5TERA.{frame_kind}.parquet")
        for frame_kind in ("reads_df", "compressed_df")
    )
    print(f"Found preprocessed files at:\n\t{reads_df_raw_path}\nand:\n\t{compressed_df_genes_raw_path}")

    reads_df_genes_raw = pd.read_parquet(reads_df_raw_path)
    compressed_df_genes_raw = pd.read_parquet(compressed_df_genes_raw_path)
except FileNotFoundError:
    print(f"Could not find preprocessed files matching these libs: {'/'.join(libs_to_load)}\nGoing to create new ones from scratch! This will take longer.")
    # The merge helper is called with the full library names, so translate the short labels back first.
    full_lib_names = [REV_CONVERSION_DICT[lib] for lib in libs_to_load]
    reads_df_genes_raw, compressed_df_genes_raw = npCommon.load_and_merge_lib_parquets(
        full_lib_names,
        drop_sub_n=1,
        add_tail_groupings=False,
        drop_failed_polya=False,
        group_by_t5=True,
        use_josh_assignment=False,
    )
    print("Saving new parquets to speed up future runs.")
    reads_df_genes_raw.to_parquet(f"{mega_merge_dir}/{npCommon.get_dt()}_{merged_libs_tag}_merged5TERA.reads_df.parquet")
    compressed_df_genes_raw.to_parquet(f"{mega_merge_dir}/{npCommon.get_dt()}_{merged_libs_tag}_merged5TERA.compressed_df.parquet")
print(f"Lib load done @ {npCommon.get_dt(for_print=True)}")

# Keep only the columns needed downstream, as an independent copy of the raw frame.
short_cols = ["lib", "chr_id", "gene_id", "gene_name", "t5", "gene_hits", "gene_rpm"]
compressed_df_genes_short = compressed_df_genes_raw[short_cols].copy()
# Display the rows for a single gene (rpl-12) as a quick look at the result.
compressed_df_genes_short.query("gene_name == 'rpl-12'")

# Format libraries as columns with shortened names

In [None]:
conversion_dict = CONVERSION_DICT
gene_id_cols = ["chr_id", "gene_id", "gene_name"]
gene_value_cols = ["gene_hits", "gene_rpm"]

# One frame per (library, t5) group, indexed by gene, with the value columns renamed to
#   "<col>_<short lib>_t5<t5>" so that all groups can sit side by side in one wide table.
df_dict = {}
for _, lib_t5_df in compressed_df_genes_short.groupby(['lib', 't5'], as_index=False):
    short_lib = conversion_dict[lib_t5_df["lib"].iloc[0]]
    t5 = lib_t5_df["t5"].iloc[0]
    column_suffix = f"_{short_lib}_t5{t5}"
    df_dict[(short_lib, t5)] = (
        lib_t5_df[gene_id_cols + gene_value_cols]
        .rename(columns={col: f"{col}{column_suffix}" for col in gene_value_cols})
        .set_index(gene_id_cols)
    )

# Outer join keeps every gene seen in any group; a gene absent from a group gets 0 there.
super_df = pd.concat(df_dict.values(), axis=1, join='outer').fillna(0)
super_df

In [None]:
# For every library: add total (t5+ plus t5-) rpm and hits per gene, add the fraction of hits
#   that are adapted (t5+), then copy all of that library's columns into filter_df_raw.

filter_df_raw = pd.DataFrame()
for lib in libs_to_load:
    adapted_suffix = f"{lib}_t5+"
    unadapted_suffix = f"{lib}_t5-"
    for metric in ("rpm", "hits"):
        adapted_values = super_df[f"gene_{metric}_{adapted_suffix}"]
        unadapted_values = super_df[f"gene_{metric}_{unadapted_suffix}"]
        super_df[f"total_gene_{metric}_{lib}"] = adapted_values.add(unadapted_values)
    # Genes with zero total hits in this library come out as NaN here (0 / 0).
    super_df[f"fraction_adapted_{lib}"] = super_df[f"gene_hits_{adapted_suffix}"].div(super_df[f"total_gene_hits_{lib}"])

    # Every column whose name contains this library's label (original and newly derived).
    lib_cols = [col for col in super_df.columns if lib in col]
    filter_df_raw[lib_cols] = super_df[lib_cols]
filter_df_raw

In [None]:
from scipy.stats import chi2_contingency, chisquare, fisher_exact, boschloo_exact, barnard_exact
def row_chi2(row, target_lib_1, target_lib_2):
    array = np.array([[row[f"gene_hits_{target_lib_1}_t5-"], row[f"gene_hits_{target_lib_2}_t5-"]],
                      [row[f"gene_hits_{target_lib_1}_t5+"], row[f"gene_hits_{target_lib_2}_t5+"]]])
    try:
        chi2, p, deg_of_free, expected = chi2_contingency(array)
        return chi2, p
    except ValueError:
        return None, None

def row_fishers_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided'):
    """Fisher's exact test on one gene's adapted/unadapted counts in two libraries.

    The 2x2 table is laid out as rows = (t5-, t5+) and columns = (target_lib_1, target_lib_2),
    read from `row` under the keys ``gene_<hits_or_rpm>_<lib>_t5-`` / ``..._t5+``.

    :param row: mapping (e.g. a DataFrame row) holding the four entries
    :param target_lib_1: short label of the first library
    :param target_lib_2: short label of the second library
    :param hits_or_rpm: which measurement to read from `row` ('hits' or 'rpm')
    :param alternative: 'two-sided', 'greater' or 'less', passed through to scipy
    :return: ``(odds ratio, p-value)``
    :raises KeyError: if `alternative` is not one of the three accepted values
    """
    key_prefix = f"gene_{hits_or_rpm}_"
    unadapted_counts = [row[f"{key_prefix}{target_lib_1}_t5-"], row[f"{key_prefix}{target_lib_2}_t5-"]]
    adapted_counts = [row[f"{key_prefix}{target_lib_1}_t5+"], row[f"{key_prefix}{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_counts, adapted_counts])
    if alternative not in ('two-sided', 'greater', 'less'):
        raise KeyError("Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for fisher's exact test!!")
    return tuple(fisher_exact(contingency_table, alternative=alternative))

def row_boschloo_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    """Boschloo's exact test on one gene's adapted/unadapted counts in two libraries.

    The 2x2 table is laid out as rows = (t5-, t5+) and columns = (target_lib_1, target_lib_2),
    read from `row` under the keys ``gene_<hits_or_rpm>_<lib>_t5-`` / ``..._t5+``.

    :param row: mapping (e.g. a DataFrame row) holding the four entries
    :param target_lib_1: short label of the first library
    :param target_lib_2: short label of the second library
    :param hits_or_rpm: which measurement to read from `row` ('hits' or 'rpm')
    :param alternative: 'two-sided', 'greater' or 'less', passed through to scipy
    :param sampling_points: forwarded to scipy's ``boschloo_exact`` as ``n``
    :return: ``(statistic, p-value)`` taken from the scipy result object
    :raises KeyError: if `alternative` is not one of the three accepted values
    """
    key_prefix = f"gene_{hits_or_rpm}_"
    unadapted_counts = [row[f"{key_prefix}{target_lib_1}_t5-"], row[f"{key_prefix}{target_lib_2}_t5-"]]
    adapted_counts = [row[f"{key_prefix}{target_lib_1}_t5+"], row[f"{key_prefix}{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_counts, adapted_counts])
    if alternative not in ('two-sided', 'greater', 'less'):
        raise KeyError("Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Boschloo's exact test!!")
    test_outcome = boschloo_exact(contingency_table, alternative=alternative, n=sampling_points)
    return test_outcome.statistic, test_outcome.pvalue

def row_barnard_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    """Barnard's exact test on one gene's adapted/unadapted counts in two libraries.

    The 2x2 table is laid out as rows = (t5-, t5+) and columns = (target_lib_1, target_lib_2),
    read from `row` under the keys ``gene_<hits_or_rpm>_<lib>_t5-`` / ``..._t5+``.

    :param row: mapping (e.g. a DataFrame row) holding the four entries
    :param target_lib_1: short label of the first library
    :param target_lib_2: short label of the second library
    :param hits_or_rpm: which measurement to read from `row` ('hits' or 'rpm')
    :param alternative: 'two-sided', 'greater' or 'less', passed through to scipy
    :param sampling_points: forwarded to scipy's ``barnard_exact`` as ``n``
    :return: ``(statistic, p-value)`` taken from the scipy result object
    :raises KeyError: if `alternative` is not one of the three accepted values
    """
    key_prefix = f"gene_{hits_or_rpm}_"
    unadapted_counts = [row[f"{key_prefix}{target_lib_1}_t5-"], row[f"{key_prefix}{target_lib_2}_t5-"]]
    adapted_counts = [row[f"{key_prefix}{target_lib_1}_t5+"], row[f"{key_prefix}{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_counts, adapted_counts])
    if alternative not in ('two-sided', 'greater', 'less'):
        raise KeyError("Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Barnard's exact test!!")
    test_outcome = barnard_exact(contingency_table, alternative=alternative, n=sampling_points)
    return test_outcome.statistic, test_outcome.pvalue

# Use the Stat Tests!

Realization on 6/22/2023:
I think I should be doing the cutoff filtering for all libraries at once, not just for the two libraries being compared. I have been doing it per comparison because that makes more sense in the context of "THAT TEST", but the paper now uses this mainly as a comparison between the newerS6 and newerS5 effects. So I should filter genes across all libraries first, and then do the stats... Damn.

In [None]:
from pathlib import Path

# Start from a fresh copy so that re-running this cell does not pile columns onto a previous run's frame.
filter_df = filter_df_raw.copy()

# Dated output folder for this run's figures and records; created if it does not exist yet.
output_dir = (
    "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/"
    "NMD_cleavage_and_deadenylation_paper/raw_figures_from_python/"
    f"{npCommon.get_dt()}_fishers"
)
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Plain-text log of everything reported by print_and_write below.
stat_test_record_file = Path(output_dir) / f"{npCommon.get_dt()}_stat_test_record_file.txt"

# (Re)create the record file with a timestamp as its first line; later messages are appended.
stat_test_record_file.write_text("{}\n".format(npCommon.get_dt(for_print=True)))

def print_and_write(string):
    """Print `string` and append it, followed by a newline, to `stat_test_record_file`.

    :param string: the message; must be a str because a newline is concatenated onto it
    """
    print(string)
    record_line = string + "\n"
    with open(stat_test_record_file, mode='a') as record_handle:
        record_handle.write(record_line)

# Significance level before multiple-testing correction. Further down it is divided by the number
#   of genes passing the read cutoff (Bonferroni). No genes are dropped based on these settings.
base_sig_cutoff = 0.05

# Read-depth filter: a gene passes when the two compared libraries together reach at least this
#   many `filter_col_target` reads, counted in `filter_with_rpm_or_hits` units.
cumulative_min_read_cutoff = 100
filter_with_rpm_or_hits = "hits"
filter_col_target = "total"  # one of: "total", "adapted", "unadapted"

# Tests to run; the other supported names are "boschloo", "barnard" and "chi2".
tests_to_run = ["fishers"]

# (first_lib, second_lib) pairs, in the order they are tested and reported.
lib_comparisons_to_test = [
    # ("newN2", "newS5"),
    # ("newN2", "newS6"),
    # ("newN2", "newS7"),
    ("oldN2", "oldS6"),
    ("newerN2", "newerS6"),
    ("newerN2", "newerS5"),
    ("newN2", "newS7"),
    # ("newerN2", "oldN2"),
    # ("sPM57", "sPM58"),
]

# Both are filled in by the testing loop, keyed by ((first_lib, second_lib), stat_test).
p_value_cutoff_dict = {}
gene_lists = {}

# Available tests, in the fixed order they are run:
#   test name -> (label for the progress bar, row-wise function returning (statistic, p-value)).
# Barnard and Boschloo take so so long to run and don't provide much of a power increase,
#   which is why they are called with only 4 sampling points.
stat_test_runners = {
    "chi2": ("Chi Squared",
             lambda row, lib_1, lib_2: row_chi2(row, lib_1, lib_2)),
    "fishers": ("Fisher's exact",
                lambda row, lib_1, lib_2: row_fishers_exact(row, lib_1, lib_2, hits_or_rpm='hits', alternative='less')),
    "barnard": ("Barnard's exact",
                lambda row, lib_1, lib_2: row_barnard_exact(row, lib_1, lib_2, hits_or_rpm='hits', alternative='less', sampling_points=4)),
    "boschloo": ("Boschloo exact",
                 lambda row, lib_1, lib_2: row_boschloo_exact(row, lib_1, lib_2, hits_or_rpm='hits', alternative='less', sampling_points=4)),
}

for libs in lib_comparisons_to_test:
    first_lib, second_lib = libs
    # Run the tests:
    # NOTE(review): catch_warnings() only saves and restores the warning filters. With no
    #   warnings.simplefilter(...) call inside the block, nothing is actually silenced here.
    with warnings.catch_warnings():
        for test_name, (test_label, row_test) in stat_test_runners.items():
            if test_name not in tests_to_run:
                continue
            tqdm.pandas(desc=f"Calculating {test_label} for {first_lib} and {second_lib}")
            filter_df[[f"{first_lib}_v_{second_lib}_{test_name}_test_result",
                       f"{first_lib}_v_{second_lib}_{test_name}_p_value"]] = filter_df.progress_apply(
                lambda row: row_test(row, first_lib, second_lib), axis=1, result_type="expand")

    # Make adjustments for multiple testing.
    # The read-depth filter depends only on the pair of libraries, so it is computed once per
    #   comparison and then recorded for every requested test.
    lib_cols_for_correction = []
    for lib in libs:
        filter_col_converter = {"total": f"total_gene_{filter_with_rpm_or_hits}_{lib}",
                                "adapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5+",
                                "unadapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5-"}
        lib_cols_for_correction.append(filter_col_converter[filter_col_target])
    cumulative_col_name = f"cumulative_{filter_col_target}_{filter_with_rpm_or_hits}_{first_lib}_{second_lib}"
    filter_df[cumulative_col_name] = filter_df[lib_cols_for_correction[0]] + filter_df[lib_cols_for_correction[1]]
    passed_cutoff_df = filter_df[filter_df[cumulative_col_name] >= cumulative_min_read_cutoff]
    number_of_genes_passing_cutoff = passed_cutoff_df.shape[0]
    if number_of_genes_passing_cutoff:
        adjusted_sig_cutoff = base_sig_cutoff / number_of_genes_passing_cutoff
    else:
        # Nothing to correct for: keep going with an undefined cutoff instead of dividing by zero.
        adjusted_sig_cutoff = float("nan")
        print_and_write(f"WARNING: no genes passed the cutoff for {first_lib} and {second_lib}, so the per-comparison Bonferroni cutoff is undefined (nan).")

    for stat_test in tests_to_run:
        gene_lists[(libs, stat_test)] = passed_cutoff_df.index
        p_value_cutoff_dict[(libs, stat_test)] = (adjusted_sig_cutoff, number_of_genes_passing_cutoff)

        print_and_write(f"There were {number_of_genes_passing_cutoff} genes that passed the cutoff of having >={cumulative_min_read_cutoff} cumulative {filter_col_target} {filter_with_rpm_or_hits} between {first_lib} and {second_lib}"
              f"\n\tA Bonferroni correction with this in mind will expect a p value of {adjusted_sig_cutoff:.3g} for a significant {stat_test} test result.")
        print_and_write(f"***But I think this isn't a great way to filter, I am going to recalculate the cutoff based on the number of genes that pass the filter for ALL LIBRARIES TESTED... This does mean the pool of libs tested matters a lot more!***")
# Now we need to recalculate the cutoff based on the number of genes that pass the filter for ALL LIBRARIES TESTED:
# For each test we build a table with one row per gene and one boolean flag per comparison,
#   then derive the Bonferroni cutoff from the genes that are flagged in every comparison.
stat_test_sig_cutoffs = {}
stat_tested_genes_dfs = {}
gene_key_cols = ['gene_id', 'gene_name']
for stat_test in tests_to_run:
    tested_genes_df = pd.DataFrame(columns=gene_key_cols)
    tested_flag_cols = []
    for (libs, list_stat_test), gene_list in gene_lists.items():
        if stat_test != list_stat_test:
            continue
        lib1, lib2 = libs
        flag_col = f'tested_for_{stat_test}_{lib1}-v-{lib2}'
        # De-duplicate on the gene keys before merging so each gene stays a single row.
        gene_list_better = gene_list.to_frame().reset_index(drop=True)[gene_key_cols].drop_duplicates()
        gene_list_better[flag_col] = True
        tested_genes_df = pd.merge(tested_genes_df, gene_list_better, how='outer', on=gene_key_cols)
        tested_flag_cols.append(flag_col)
    tested_genes_df = tested_genes_df.drop_duplicates()
    # After the outer merges a flag is either True or missing; turn that into clean booleans.
    #   Only the flag columns are touched, so the gene identifier columns are never altered
    #   and never take part in the "tested for all" decision.
    tested_flags = tested_genes_df[tested_flag_cols].eq(True)
    for flag_col in tested_flag_cols:
        tested_genes_df[flag_col] = tested_flags[flag_col]
    tested_for_all_col = f'tested_{stat_test}_for_all'
    tested_genes_df[tested_for_all_col] = tested_flags.all(axis='columns')

    total_genes_tested_count = tested_genes_df.shape[0]
    universally_tested_genes_count = int(tested_genes_df[tested_for_all_col].sum())
    if universally_tested_genes_count:
        adjusted_sig_cutoff = base_sig_cutoff / universally_tested_genes_count
    else:
        # Avoid a ZeroDivisionError; with a nan cutoff no p-value will be called significant.
        adjusted_sig_cutoff = float("nan")
        print_and_write(f"WARNING: no genes were tested for {stat_test} among all libraries, so the Bonferroni cutoff is undefined (nan).")
    stat_test_sig_cutoffs[stat_test] = adjusted_sig_cutoff
    stat_tested_genes_dfs[stat_test] = tested_genes_df
    print_and_write(f"Of the {total_genes_tested_count} genes tested for {stat_test}, {universally_tested_genes_count} were tested among all libraries.")

    # Add a column to the filter_df that stores a Bool depending on if the gene was tested in all comparisons.
    #   Any copy of that column left over from an earlier run is dropped first so the merge stays clean.
    temp_filter_df = filter_df.reset_index().drop(columns=[tested_for_all_col], errors='ignore')
    temp_filter_df = pd.merge(temp_filter_df, tested_genes_df[gene_key_cols + [tested_for_all_col]], how='left', on=gene_key_cols)
    # Genes that never passed any filter have no match in tested_genes_df and count as not tested.
    temp_filter_df[tested_for_all_col] = temp_filter_df[tested_for_all_col].eq(True)
    filter_df = temp_filter_df.set_index(['chr_id', 'gene_id', 'gene_name'])


# For every test and comparison: add the -log10(p) and "significant" columns, then report the
#   significant genes (sorted by p-value) to stdout and to the record file.
for stat_test in tests_to_run:
    sig_cutoff = stat_test_sig_cutoffs[stat_test]
    for (first_lib, second_lib) in lib_comparisons_to_test:
        p_value_col = f"{first_lib}_v_{second_lib}_{stat_test}_p_value"
        significant_col = f"{first_lib}_v_{second_lib}_{stat_test}_significant"
        # Check for the column explicitly rather than catching KeyError around the whole step,
        #   so that an unrelated KeyError is not reported as a spelling problem.
        if p_value_col not in filter_df.columns:
            print(f"Couldn't find columns corresponding to '{stat_test}'!! Be sure spelling is correct!")
            continue
        filter_df[f"neg_log10_{p_value_col}"] = -np.log10(filter_df[p_value_col])
        filter_df[significant_col] = filter_df[p_value_col] <= sig_cutoff

        # Some nice printing!
        #####################
        sig_genes = (filter_df.loc[filter_df[significant_col], [p_value_col]]
                     .reset_index()[['gene_id', 'gene_name', p_value_col]]
                     .sort_values(p_value_col))
        print_and_write(f"For {first_lib} v {second_lib} {stat_test} test:")
        print_and_write(f"    {sig_genes.shape[0]} genes were significant at a p value of {sig_cutoff:.3g} (Bonferroni corrected for all genes tested)")
        sig_genes_list = sig_genes.values.tolist()
        sig_genes_string = '\n'.join([f"        {gene_id}    {gene_name:<10}    p-val: {p_value:.3g}" for gene_id, gene_name, p_value in sig_genes_list])
        print_and_write(f"    Genes:\n{sig_genes_string}")
        #####################
    print(f"Total Count of genes tested: {filter_df.shape[0]}, with {filter_df[filter_df[f'tested_{stat_test}_for_all']].shape[0]} genes tested among all libraries.")
# Simple save: write the whole results frame to a single CSV next to the record file.
large_df_file_name = f"{npCommon.get_dt()}_statTests_{'-'.join(tests_to_run)}_largeDF.csv"
df_output_path = Path(stat_test_record_file.parent, large_df_file_name)
print_and_write(f"Saving the entire df to {df_output_path}...")
filter_df.to_csv(df_output_path)
print_and_write('done.')

# Look at the top "most significant" hits

In [None]:

# Number of top rows (smallest p-values) to save per comparison; None saves every gene.
how_many_hits_to_save = None
# When True, the saved Fisher's CSVs are read back and merged into one table.
load_and_merge_simple = True


def _hits_csv_path(first_lib, second_lib, stat_test):
    """Return the CSV path for one comparison/test; used for both saving and re-loading."""
    path = f"./output_files/{npCommon.get_dt()}_{first_lib}_v_{second_lib}.{stat_test}"
    if isinstance(how_many_hits_to_save, int):
        path += f".top{how_many_hits_to_save}"
    return path + ".csv"


for stat_test in tests_to_run:
    for first_lib, second_lib in lib_comparisons_to_test:
        temp_df = filter_df[[f"gene_rpm_{first_lib}_t5-",
                            f"gene_rpm_{first_lib}_t5+",
                            f"gene_rpm_{second_lib}_t5-",
                            f"gene_rpm_{second_lib}_t5+",
                            f"{first_lib}_v_{second_lib}_{stat_test}_test_result",
                            f"{first_lib}_v_{second_lib}_{stat_test}_p_value",
                            f"{first_lib}_v_{second_lib}_{stat_test}_significant",
                            ]].sort_values(f"{first_lib}_v_{second_lib}_{stat_test}_p_value",
                                           ascending=True)
        save_path = _hits_csv_path(first_lib, second_lib, stat_test)
        if isinstance(how_many_hits_to_save, int):
            temp_df = temp_df.head(how_many_hits_to_save)
            hits_label = f"top {how_many_hits_to_save}"
        else:
            hits_label = "all"
        print(temp_df)
        temp_df.to_csv(save_path)
        print(f"Saved {hits_label} hits to {save_path}")

# Defined up front so the display line at the end of the cell never raises a NameError
#   (or shows a stale frame from an earlier run) when the merge step is switched off.
df_dict_fishers_merged = None
if load_and_merge_simple:
    # This step only handles the "fishers" files written above.
    df_dict_fishers = {}
    for first_lib, second_lib in lib_comparisons_to_test:
        load_path = _hits_csv_path(first_lib, second_lib, "fishers")
        df_dict_fishers[(first_lib, second_lib)] = pd.read_csv(load_path)[['gene_name', 'gene_id', f"{first_lib}_v_{second_lib}_fishers_p_value", f"{first_lib}_v_{second_lib}_fishers_significant"]]
    # Merge all of the per-comparison frames into one, keyed on the gene_id and gene_name columns:
    df_dict_fishers_merged = pd.DataFrame(columns=['gene_name', 'gene_id'])
    for first_lib, second_lib in lib_comparisons_to_test:
        df_dict_fishers_merged = df_dict_fishers_merged.merge(df_dict_fishers[(first_lib, second_lib)], how='outer', on=['gene_name', 'gene_id'])
    print(df_dict_fishers_merged.head())
    df_dict_fishers_merged.to_csv(f"./output_files/{npCommon.get_dt()}_merged_fishers.csv")
df_dict_fishers_merged

# Plot fraction adapted fold change and Chi2/Fishers p-values

In [None]:
from plotly.subplots import make_subplots
def plot_multiple_test_scatters(multi_plot_df, lib_sets_to_plot, stat_test_to_plot, save_path_for_fig, round_inf_to=8, filter_based_on_all_not_lib_alone=True, shapes_col=None):
    """Plot, per library comparison, log2 fold change in fraction adapted (x) against -log10 p-value (y).

    One subplot is drawn per comparison. Points are red when the comparison's "significant" column
    is True and black otherwise, and a horizontal line marks the Bonferroni cutoff. The figure is
    written to ``<save_path_for_fig>.html``, ``.svg`` and ``.png`` and then shown.

    Besides its arguments, the function reads these notebook-level names: ``stat_test_sig_cutoffs``,
    ``p_value_cutoff_dict``, ``filter_col_target``, ``filter_with_rpm_or_hits`` and
    ``cumulative_min_read_cutoff``.

    :param multi_plot_df: frame indexed by (chr_id, gene_id, gene_name) holding the
        ``fraction_adapted_<lib>``, ``total_gene_rpm_<lib>``, ``neg_log10_..._p_value`` and
        ``..._significant`` columns for every comparison; it is not modified
    :param lib_sets_to_plot: list of (first_lib, second_lib) pairs, one subplot each
    :param stat_test_to_plot: test name used to pick the p-value/significance columns and the cutoff
    :param save_path_for_fig: output path without extension
    :param round_inf_to: x (and any other +/-inf value in the frame) is replaced by +/- this number
    :param filter_based_on_all_not_lib_alone: if True, use the cutoff shared by all comparisons
        (``stat_test_sig_cutoffs``); otherwise the per-comparison one (``p_value_cutoff_dict``)
    :param shapes_col: optional bool or int column used to pick marker symbols for significant genes
    :return: the plotted frame (index reset, MtDNA rows removed, fold-change columns added)
    :raises TypeError: if `shapes_col` is given but is neither a bool nor an integer column
    """
    # Work on a copy so the new columns are not written into the caller's frame.
    multi_plot_df = multi_plot_df.copy()
    for first_lib, second_lib in lib_sets_to_plot:
        multi_plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"] = np.log2(
            multi_plot_df[f"fraction_adapted_{first_lib}"] / multi_plot_df[f"fraction_adapted_{second_lib}"])
    multi_plot_df = multi_plot_df.reset_index()
    multi_plot_df = multi_plot_df[multi_plot_df.chr_id != 'MtDNA']
    # Infinite values (e.g. a fold change with a zero on one side) are pinned to +/- round_inf_to
    #   so that they can be drawn at the "INF" ticks.
    multi_plot_df = multi_plot_df.replace({-np.inf: -round_inf_to, np.inf: round_inf_to})

    fig = make_subplots(rows=1, cols=len(lib_sets_to_plot),
                        shared_yaxes=True,
                        shared_xaxes=True,
                        vertical_spacing=0.02,
                        horizontal_spacing=0.02,
                        )
    for i, (first_lib, second_lib) in enumerate(lib_sets_to_plot):
        print(first_lib, second_lib)
        x_col = f"fracAdapted_FC_{first_lib}/{second_lib}"
        y_col = f"neg_log10_{first_lib}_v_{second_lib}_{stat_test_to_plot}_p_value"
        sig_col = f"{first_lib}_v_{second_lib}_{stat_test_to_plot}_significant"

        # Marker symbols: only significant genes get a non-default symbol.
        if shapes_col is None:
            marker_symbol_col = None
        elif pd.api.types.is_bool_dtype(multi_plot_df[shapes_col]):
            multi_plot_df[f'{shapes_col}_{first_lib}/{second_lib}'] = multi_plot_df[shapes_col] & multi_plot_df[sig_col]
            marker_symbol_col = multi_plot_df[f'{shapes_col}_{first_lib}/{second_lib}'].replace({False: 'circle', True: 'cross'})
        elif pd.api.types.is_integer_dtype(multi_plot_df[shapes_col]):
            new_shapes_col = f'{shapes_col}_{first_lib}/{second_lib}'
            multi_plot_df[new_shapes_col] = multi_plot_df[shapes_col]
            multi_plot_df.loc[~multi_plot_df[sig_col], new_shapes_col] = 0
            marker_symbol_col = multi_plot_df[new_shapes_col].replace({0: 'circle', 1: 'cross', 2: 'triangle-up', 3: 'square'})
        else:
            raise TypeError(f"shapes_col must be a bool or int dtype, not {multi_plot_df[shapes_col].dtype}:\n\n{multi_plot_df[shapes_col].head()}")

        # Hover text; the prev_id_list line is only added when the caller supplied that column.
        hover_cols = [f"total_gene_rpm_{first_lib}", f"total_gene_rpm_{second_lib}"]
        hover_lines = ["<b>%{hovertext}</b><br>",
                       "FC Fraction Adapted: %{x:.2f}<br>",
                       "-log<sub>10</sub> P-Value: %{y:.2e}<br>",
                       f"total gene RPM {first_lib}: " + "%{customdata[0]:.2f}<br>",
                       f"total gene RPM {second_lib}: " + "%{customdata[1]:.2f}<br>"]
        if "prev_id_list" in multi_plot_df.columns:
            hover_cols.append("prev_id_list")
            hover_lines.append("prev_id_list: %{customdata[2]}<br>")
        hover_lines.append("<extra></extra>")

        fig.add_trace(go.Scatter(
            x=multi_plot_df[x_col],
            y=multi_plot_df[y_col],
            marker_color=multi_plot_df[sig_col].replace({False: "black", True: "red"}),
            marker_symbol=marker_symbol_col,
            hovertext=multi_plot_df["gene_name"],
            customdata=multi_plot_df[hover_cols],
            hovertemplate="".join(hover_lines),
            mode="markers",
            name=f"{second_lib}"),
            row=1, col=i + 1)

        # Use the test named by the argument (not a notebook-level variable) to look up the cutoff.
        if filter_based_on_all_not_lib_alone:
            cutoff = stat_test_sig_cutoffs[stat_test_to_plot]
            num_passed_genes = multi_plot_df[f"tested_{stat_test_to_plot}_for_all"].sum()
        else:
            cutoff, num_passed_genes = p_value_cutoff_dict[((first_lib, second_lib), stat_test_to_plot)]
        fig.add_hline(y=-np.log10(cutoff),
                      row=1, col=i + 1)
        fig.add_annotation(text=f"Bonferroni<br>p-value = {cutoff:0.3e}",
                           yref="y",
                           y=-np.log10(cutoff),
                           xref="x domain",
                           x=0,
                           yanchor="bottom",
                           showarrow=False,
                           align="left",
                           row=1, col=i + 1)

        # Bottom of plot text (reports the read cutoff that was actually configured):
        bottom_text = f"<b>genes w/ >= {cumulative_min_read_cutoff} {filter_col_target} {filter_with_rpm_or_hits} = {num_passed_genes}</b>"
        fig.add_annotation(text=bottom_text,
                           yref="y domain",
                           y=0,
                           xref="x domain",
                           x=0.5,
                           yanchor="bottom",
                           showarrow=False,
                           align="center",
                           row=1, col=i + 1)
        # Cutoffs to isolate the +INF genes: white bands on either side of x = round_inf_to.
        #   Addressed to this subplot only, so earlier subplots do not collect duplicate shapes.
        fig.add_vrect(x0=round_inf_to - 1.0,
                      x1=round_inf_to - 0.5,
                      line_width=0,
                      fillcolor='white',
                      row=1, col=i + 1,
                      )
        fig.add_vrect(x0=round_inf_to + 1.0,
                      x1=round_inf_to + 0.5,
                      line_width=0,
                      fillcolor='white',
                      row=1, col=i + 1,
                      )
        # X Axis Title:
        # NOTE(review): the title is built by slicing second_lib, which assumes labels shaped like "newerS6".
        fig.update_xaxes(title_text=f"Fold Change Fraction Adapted<br>N2 vs. smg-{second_lib[-1]} ({second_lib[:-2]})",
                         row=1, col=i + 1)
    fig.update_layout(
        font=dict(
            family="Rockwell",  # "Rockwell" "Courier New, monospace"
            size=16,  # Set the font size here
            color="black"
        ),
        hoverlabel=dict(
            bgcolor="white",
            font_size=20,
            font_family="Rockwell",  # "Rockwell" "Courier New, monospace"
        ),
        showlegend=False,
    )
    # Title:
    title_text = (f"<b>FC Frac. Adapted vs. {stat_test_to_plot.title()} Test P-Values</b>"
                 f"<br>Cumulative {filter_col_target} {filter_with_rpm_or_hits} for Cutoff = {cumulative_min_read_cutoff}")
    if shapes_col is not None:
        title_text += f"<br>Cross = {shapes_col}"
    fig.update_layout(template="plotly_white",
                      title=title_text)
    fig.update_xaxes(tickvals=[f"-{round_inf_to}",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               f"{round_inf_to}"],
                     ticktext=["-INF",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               "INF"],
                     range=[-2, round_inf_to+0.5])
    fig.update_yaxes(title_text=f"-log<sub>10</sub> {stat_test_to_plot.title()} Test P-Value",
                     row=1, col=1)

    # The HTML is written before the fixed size is applied; the static images use the fixed size.
    fig.write_html(save_path_for_fig + ".html")
    fig.update_layout(
        autosize=False,
        width=600*len(lib_sets_to_plot),
        height=600, )
    fig.write_image(save_path_for_fig + ".svg")
    fig.write_image(save_path_for_fig + ".png")
    fig.show()
    return multi_plot_df

stat_test = "fishers"  # "fishers" or "chi2"
lib_comparisons_to_plot = [
    # ("newN2", "newS5"),
    # ("newN2", "newS6"),
    # ("newN2", "newS7"),
    ("oldN2", "oldS6"),
    # ("sPM57", "sPM58"),
    ("newerN2", "newerS6"),
    ("newerN2", "newerS5"),
    ("newN2", "newS7"),
    # ("newerN2", "oldN2"),
]
# Unique library labels across all comparisons, sorted, for the output file name.
flat_plot_libs = sorted({lib for lib_pair in lib_comparisons_to_plot for lib in lib_pair})
save_dir = Path(output_dir)
save_name = f"{npCommon.get_dt()}_{'-'.join(flat_plot_libs)}_{stat_test}Pvalues-v-FCfractionAdapted.scatters"
plot_df = filter_df.copy(deep=True)
# plot_df = plot_df[plot_df.tested_fishers_for_all]

### Adding on 6/27/2023 to add shapes depending on previously identified NMD targets:
marker_shape_col = "prev_id_count" # ramani2009_hit, muir2018_hit, kim_modena2022_smg6_hit, kim_modena2022_smg1_hit, or mitrovich_hit
prev_targets_df = pd.read_csv("/data16/marcus/working/230602_comparingNMDTargets/230627_totalMerge.csv")
# A gene counts as a Kim/Modena hit when it was a hit in either of the two kim_modena2022 columns.
prev_targets_df['kim_modena2022_hit'] = (prev_targets_df['kim_modena2022_smg1_hit'] | prev_targets_df['kim_modena2022_smg6_hit'])
prev_hit_cols = ['muir2018_hit', 'ramani2009_hit', 'kim_modena2022_hit']
# Study label = column name without its "_hit" suffix. The suffix is sliced off explicitly because
#   str.rstrip("_hit") strips any trailing '_', 'h', 'i', 't' characters rather than the suffix.
prev_hit_labels = {col: col[:-len("_hit")] if col.endswith("_hit") else col for col in prev_hit_cols}
prev_targets_df['prev_id_count'] = prev_targets_df[prev_hit_cols].sum(axis=1)
prev_targets_df['prev_id_list'] = prev_targets_df[prev_hit_cols].apply(lambda row: ", ".join([prev_hit_labels[col] for col in row.index if row[col]]), axis=1)
plot_df = plot_df.reset_index()
# Bring over only the marker column and the study list, matched on gene name and id;
#   genes without a match in prev_targets_df get NaN here and are filled in just below.
plot_df = plot_df.merge(prev_targets_df[['gene_name', 'gene_id', marker_shape_col, 'prev_id_list']], on=['gene_name', 'gene_id'], how="left").set_index(['chr_id', 'gene_id', 'gene_name'])
if marker_shape_col == "prev_id_count":
    plot_df[marker_shape_col] = plot_df[marker_shape_col].fillna(0)
    plot_df[marker_shape_col] = plot_df[marker_shape_col].astype(int)
else:
    plot_df[marker_shape_col] = plot_df[marker_shape_col].fillna(False)
save_name += f".withShapesFrom_{marker_shape_col}"
###

save_path = save_dir / save_name
plot_df = plot_multiple_test_scatters(plot_df, lib_comparisons_to_plot, stat_test, str(save_path), shapes_col=marker_shape_col, filter_based_on_all_not_lib_alone=True)

plot_df.query("gene_name == 'F19B2.5'").T

# Looking at reproducibility
## April 3, 2023

In [None]:

# Exactly two comparisons: the first goes on the x axis, the second on the y axis.
lib_comparisons_to_compare = [  # Try to just do two for now!
    # ("newN2", "newS5"),
    # ("newN2", "newS6"),
    # ("newN2", "newS7"),
    # ("oldN2", "oldS6"),
    ("newerN2", "newerS6"),
    ("oldN2", "oldS6"),
    # ("newerN2", "newerS5"),
    # ("newN2", "newS7"),
]
stat_test = 'fishers'  # 'fishers' or 'chi2'

# Bind each pair as a whole (x_libs, y_libs) and also its individual library labels.
(x_lib1, x_lib2), (y_lib1, y_lib2) = x_libs, y_libs = lib_comparisons_to_compare

# Keep the gene identifier columns plus every column that mentions one of the two comparisons
#   or one of their libraries, leaving out chi2 columns. Columns stay in plot_df's own order.
lib_comparisons_to_filter = [f"{libs[0]}_v_{libs[1]}" for libs in lib_comparisons_to_compare]
lib_names_to_filter = [lib_name for sublist in lib_comparisons_to_compare for lib_name in sublist]
filter_cols = [col for col in plot_df.columns
               if (col in ['chr_id', 'gene_id', 'gene_name']
                   or any(filter_substring in col for filter_substring in lib_comparisons_to_filter + lib_names_to_filter))
               and "chi2" not in col]
print(filter_cols)

# Restrict to genes that were tested in every comparison; .copy() so that the column added
#   below is written to an independent frame rather than to a slice of plot_df.
tested_genes_df_for_test = stat_tested_genes_dfs[stat_test]
tested_genes_list = tested_genes_df_for_test[tested_genes_df_for_test[f'tested_{stat_test}_for_all']].gene_id.to_list()
new_df = plot_df.loc[plot_df.gene_id.isin(tested_genes_list), filter_cols].copy()
sea.set_style('white')

# npCommon.boolDF_to_upsetPlot(new_df)
# new_new_df = new_df.select_dtypes(include='bool')
# new_new_df[new_new_df[new_new_df.columns[0]] + new_new_df[new_new_df.columns[1]]].merge(new_df['gene_name'], right_index=True, left_index=True)
print(new_df.columns)
x_col = f'neg_log10_{x_lib1}_v_{x_lib2}_{stat_test}_p_value'
y_col = f'neg_log10_{y_lib1}_v_{y_lib2}_{stat_test}_p_value'
# new_df = new_df[~new_df[x_col].isna()]
# new_df = new_df[~new_df[y_col].isna()]
# Mean total RPM over the four libraries, plus 1.
new_df['average_rpm'] = (new_df[f'total_gene_rpm_{x_lib1}'] +
                         new_df[f'total_gene_rpm_{x_lib2}'] +
                         new_df[f'total_gene_rpm_{y_lib1}'] +
                         new_df[f'total_gene_rpm_{y_lib2}']) / 4 + 1
rpm_dict = {f'total_gene_rpm_{lib}': f'Gene RPM ({lib})' for lib in [x_lib1, x_lib2, y_lib1, y_lib2]}
labels_dict = {
                     x_col: f"-log<sub>10</sub> {stat_test.title()} Test P-Value ({x_lib1} vs {x_lib2})",
                     y_col: f"-log<sub>10</sub> {stat_test.title()} Test P-Value ({y_lib1} vs {y_lib2})",
                     "chr_id": "Chromosome",
                 }
labels_dict.update(rpm_dict)
# Just drop the freaking weird pseudogenes: F23A7.4 and .8:
new_df = new_df[~new_df.gene_name.isin(['F23A7.4', 'F23A7.8'])]
print(new_df.head())
fig = px.scatter(new_df[new_df.average_rpm > 150],
                 x=x_col,
                 y=y_col,
                 hover_name='gene_name',
                 hover_data=['chr_id'] + list(rpm_dict),
                 template='none',
                 # size='average_rpm',
                 height=1000,
                 width=1000,
                 labels=labels_dict,
                 )
fig.update_traces(marker=dict(color='black',
                              size=10))
# NOTE(review): these are the per-comparison Bonferroni cutoffs from p_value_cutoff_dict, not the
#   all-comparison cutoff (stat_test_sig_cutoffs) used for the "significant" columns - confirm intent.
x_cutoff, _ = p_value_cutoff_dict[(x_libs, stat_test)]
y_cutoff, _ = p_value_cutoff_dict[(y_libs, stat_test)]
# Horizontal line: cutoff of the comparison on the y axis.
fig.add_shape(y0=-np.log10(y_cutoff), y1=-np.log10(y_cutoff), yref='y',
              x0=0, x1=1, xref='paper',
              type='line', line_color='salmon', line_width=3, line_dash='dot',
              opacity=1,
              )
# Vertical line: cutoff of the comparison on the x axis.
fig.add_shape(x0=-np.log10(x_cutoff), x1=-np.log10(x_cutoff), xref='x',
              y0=0, y1=1, yref='paper',
              type='line', line_color='salmon', line_width=3, line_dash='dot',
              opacity=1,
              )
fig.update_yaxes(title_text=f"<b>-log<sub>10</sub> {stat_test.title()} Test P-Value<br>"
                            f"{y_lib1} vs {y_lib2}")
fig.update_xaxes(title_text=f"<b>-log<sub>10</sub> {stat_test.title()} Test P-Value<br>"
                            f"{x_lib1} vs {x_lib2}")
fig.update_yaxes(rangemode='nonnegative')
fig.update_xaxes(rangemode='nonnegative')
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,  # Set the font size here
        color="black"
    ),
     hoverlabel=dict(
        bgcolor="white",
        font_size=20,
        font_family="Rockwell"
    )
)
fig.update_layout(yaxis = dict( tickfont = dict(size=20)),
                  xaxis = dict( tickfont = dict(size=20)))
save_dir = "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python"
# File name uses each first library's label without its "N2" suffix (e.g. "newerN2" -> "newer").
#   The suffix is sliced off explicitly: str.rstrip('N2') would strip any trailing 'N'/'2' characters.
x_batch_label, y_batch_label = (lib[:-len("N2")] if lib.endswith("N2") else lib for lib in (x_lib1, y_lib1))
save_path = save_dir + f"/{npCommon.get_dt()}_-log10{stat_test.title()}_{y_batch_label}_v_{x_batch_label}"
fig.write_html(save_path + ".html")
fig.write_image(save_path + ".svg")
fig.write_image(save_path + ".png")
fig.show()

# Plot the relationship between fraction-adapted fold changes in smg-5 and smg-6
This complements the analogous scatter plot of P-values above.

In [None]:

# Scatter plot comparing the fraction-adapted fold change of two library
# comparisons: the first active pair goes on the x-axis, the second on the
# y-axis. Exactly two pairs must be left uncommented for the unpacking below.
lib_comparisons_to_compare = [  # Try to just do two for now!
    # ("newN2", "newS5"),
    # ("newN2", "newS6"),
    # ("newN2", "newS7"),
    # ("oldN2", "oldS6"),
    ("newerN2", "newerS6"),
    ("newerN2", "newerS5"),
]
stat_test = 'fishers'  # 'fishers' or 'chi2'

(x_lib1, x_lib2), (y_lib1, y_lib2) = x_libs, y_libs = lib_comparisons_to_compare

# Keep the identifier columns plus every column that mentions one of the
# compared libraries (or a "<lib1>_v_<lib2>" comparison tag). Iterating over
# plot_df.columns (instead of round-tripping through a set) keeps the column
# order stable from run to run.
lib_comparisons_to_filter = [f"{libs[0]}_v_{libs[1]}" for libs in lib_comparisons_to_compare]
filter_cols = list(dict.fromkeys(
    col for col in plot_df.columns
    if col in ['chr_id', 'gene_id', 'gene_name']
    or any(filter_substring in col for filter_substring in lib_comparisons_to_filter)
    or any(lib_name in col for sublist in lib_comparisons_to_compare for lib_name in sublist)
))
print(filter_cols)
# Drop the chi-squared columns from the selection.
filter_cols = [col for col in filter_cols if "chi2" not in col]
new_df = plot_df[filter_cols]

# Restrict to genes flagged as tested for all comparisons under the chosen test.
tested_genes_list = stat_tested_genes_dfs[stat_test][stat_tested_genes_dfs[stat_test][f'tested_{stat_test}_for_all']].gene_id.to_list()
# Take an explicit copy: new_df is a slice of plot_df, and a column is added
# to it below. Without the copy pandas treats that as chained assignment.
new_df = new_df.query("gene_id in @tested_genes_list").copy()
sea.set_style('white')

# npCommon.boolDF_to_upsetPlot(new_df)
# new_new_df = new_df.select_dtypes(include='bool')
# new_new_df[new_new_df[new_new_df.columns[0]] + new_new_df[new_new_df.columns[1]]].merge(new_df['gene_name'], right_index=True, left_index=True)
print(new_df.columns)
# x_col = f'neg_log10_{x_lib1}_v_{x_lib2}_fishers_p_value'
x_col = f"fracAdapted_FC_{x_lib1}/{x_lib2}"
# y_col = f'neg_log10_{y_lib1}_v_{y_lib2}_fishers_p_value'
y_col = f"fracAdapted_FC_{y_lib1}/{y_lib2}"
# new_df = new_df[~new_df[x_col].isna()]
# new_df = new_df[~new_df[y_col].isna()]
# Mean gene RPM over the four libraries, offset by 1.
new_df['average_rpm'] = (new_df[f'total_gene_rpm_{x_lib1}'] +
                         new_df[f'total_gene_rpm_{x_lib2}'] +
                         new_df[f'total_gene_rpm_{y_lib1}'] +
                         new_df[f'total_gene_rpm_{y_lib2}']) / 4 + 1
# Human-readable labels for the hover box and axes.
rpm_dict = {f'total_gene_rpm_{lib}': f'Gene RPM ({lib})' for lib in [x_lib1, x_lib2, y_lib1, y_lib2]}
labels_dict = {
                     x_col: f"Fraction Adapted Fold Change ({x_lib1} vs {x_lib2})",
                     y_col: f"Fraction Adapted Fold Change ({y_lib1} vs {y_lib2})",
                     "chr_id": "Chromosome",
                 }
labels_dict.update(rpm_dict)
# Just drop the freaking weird pseudogenes: F23A7.4 and .8:
new_df = new_df[~new_df.gene_name.isin(['F23A7.4', 'F23A7.8'])]
print(new_df.head())
# Only genes whose average RPM exceeds 150 are plotted.
fig = px.scatter(new_df[new_df.average_rpm > 150],
                 x=x_col,
                 y=y_col,
                 hover_name='gene_name',
                 hover_data=['chr_id'] + list(rpm_dict),
                 template='none',
                 # size='average_rpm',
                 height=1000,
                 width=1000,
                 labels=labels_dict,
                 )
# TODO: color the points by whether they are significant in the fishers test
# (not implemented yet; every marker is drawn in black below).
fig.update_traces(marker=dict(color='black',
                              size=10))
# NOTE(review): the explicit axis ranges set further down start at -2, which
# appears to supersede this 'nonnegative' setting — confirm which is intended.
fig.update_yaxes(rangemode='nonnegative')
fig.update_xaxes(rangemode='nonnegative')
fig.update_layout(
    font=dict(
        family="Courier New, monospace",
        size=18,  # Set the font size here
        color="black"
    ),
     hoverlabel=dict(
        bgcolor="white",
        font_size=20,
        font_family="Rockwell"
    )
)
# Ticks at +/-round_inf_to are relabelled as +/-INF on both axes.
# NOTE(review): presumably infinite fold changes were clipped to this value
# upstream — confirm against the code that builds the fracAdapted_FC columns.
round_inf_to = 8
fig.update_xaxes(tickvals=[f"-{round_inf_to}",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               f"{round_inf_to}"],
                  ticktext=["-INF",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               "INF"],
                  range=[-2, round_inf_to+0.5])
fig.update_yaxes(tickvals=[f"-{round_inf_to}",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               f"{round_inf_to}"],
                  ticktext=["-INF",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               "INF"],
                  range=[-2, round_inf_to+0.5])

fig.update_layout(yaxis = dict( tickfont = dict(size=20)),
                  xaxis = dict( tickfont = dict(size=20)))
# Write the figure as HTML, SVG and PNG under a timestamped file name.
# NOTE(review): rstrip('N2') strips trailing 'N'/'2' characters rather than a
# literal "N2" suffix, and with the pairs selected above both x_lib1 and
# y_lib1 are 'newerN2', so the file name does not distinguish the two
# comparisons — confirm whether x_lib2 / y_lib2 were intended here.
save_dir = "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python"
save_path = save_dir + f"/{npCommon.get_dt()}_fracAdaptedFC_{y_lib1.rstrip('N2')}_v_{x_lib1.rstrip('N2')}"
fig.write_html(save_path + ".html")
fig.write_image(save_path + ".svg")
fig.write_image(save_path + ".png")
fig.show()

# Nested Bar Plots of Fraction Adapted FC for smg-5 and smg-6

In [None]:
# Keep the genes flagged significant by Fisher's test in either the smg-5 or
# the smg-6 comparison. Comparing with `== True` treats any value other than
# True as not significant.
small_df = new_df[(new_df.newerN2_v_newerS5_fishers_significant == True)
                  | (new_df.newerN2_v_newerS6_fishers_significant == True)]


# So we want to make a nested bar plot, with one color per fracAdapted_FC column
# (fracAdapted_FC_newerN2/newerS6 and fracAdapted_FC_newerN2/newerS5, plus
# oldN2/oldS6 when that column is available).
# The X-axis will be all of the genes in small_df
# The Y-axis will be the fraction adapted FC
# We want each bar side by side for each gene, not stacked:

# new_df only carries columns for the libraries selected when it was built, so
# a requested fold-change column (e.g. the oldN2/oldS6 one) can be absent.
# Plot the columns that exist and warn about the rest instead of letting
# px.bar fail on a missing column.
nested_bar_fc_cols = ["fracAdapted_FC_newerN2/newerS6", "fracAdapted_FC_newerN2/newerS5", "fracAdapted_FC_oldN2/oldS6"]
nested_bar_missing_cols = [col for col in nested_bar_fc_cols if col not in small_df.columns]
if nested_bar_missing_cols:
    warnings.warn(f"Skipping fold-change columns that are not in new_df: {nested_bar_missing_cols}")
    nested_bar_fc_cols = [col for col in nested_bar_fc_cols if col in small_df.columns]
if not nested_bar_fc_cols:
    raise KeyError("None of the requested fracAdapted_FC columns are present in new_df")

fig = px.bar(small_df,
             x="gene_name",
             y=nested_bar_fc_cols,
             # color_discrete_sequence=["red", "blue"],
             template="none",
             height=500,
             width=1000,
             barmode="group",
             )

# round_inf_to is defined in the fraction-adapted scatter-plot cell above; that
# cell must have been run first. Ticks at +/-round_inf_to are labelled +/-INF.
fig.update_yaxes(tickvals=[f"-{round_inf_to}",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               f"{round_inf_to}"],
                  ticktext=["-INF",
                               "-6", "-4", "-2", "0", "2", "4", "6",
                               "INF"],
                  range=[-2, round_inf_to+0.5])
# fig.update_layout(yaxis = dict( tickfont = dict(size=20)),
#                     xaxis = dict( tickfont = dict(size=20)))
# change y axis name:
fig.update_yaxes(title_text="Fraction Adapted Fold Change<br>newerN2 vs. newerS6/newerS5")

# Write the figure as HTML, SVG and PNG under a timestamped file name.
save_dir = "/home/marcus/Insync/mviscard@ucsc.edu/Google Drive/insync_folder/NMD_cleavage_and_deadenylation_paper/raw_figures_from_python"
save_path = save_dir + f"/{npCommon.get_dt()}_fracAdaptedFC_nestedBarPlot"
fig.write_html(save_path + ".html")
fig.write_image(save_path + ".svg")
fig.write_image(save_path + ".png")
fig.show(renderer="browser")

# Assessing p-value rank "shape"

In [None]:
# lib_comparisons_to_plot = [
#     # ("newN2", "newS5"),
#     ("newN2", "newS6"),
#     # ("newN2", "newS7"),
#     ("oldN2", "oldS6"),
#     # ("sPM57", "sPM58"),
#     ("newerN2", "newerS6"),
# ]
# plot_titles = list(zip([f"{libs[0]} vs {libs[1]}" for libs in lib_comparisons_to_plot], ["(zoom)"]*len(lib_comparisons_to_plot)))
# 
# plot_titles = [item for sublist in plot_titles for item in sublist]
# 
# fig = make_subplots(rows=len(lib_comparisons_to_plot), cols=2,
#                     column_widths=[0.8, 0.2],
#                     shared_yaxes=False,
#                     shared_xaxes=True,
#                     vertical_spacing=0.1,
#                     horizontal_spacing=0.02,
#                     subplot_titles=plot_titles)
# 
# for i, (lib1, lib2) in enumerate(lib_comparisons_to_plot):
#     plt_df = filter_df.sort_values(f"{lib1}_v_{lib2}_fishers_p_value").reset_index()
#     p_values = plt_df[f'{lib1}_v_{lib2}_fishers_p_value']
#     p_value_ranks = p_values.rank()
#     non_1_max_p_value_rank = list(set(plt_df[f'{lib1}_v_{lib2}_fishers_p_value'].rank()))[-2]
#     # ^ This value is similar to the number of p-values that I am "really testing"
#     
#     # Overall plots:
#     fig.add_trace(go.Scatter(x=p_value_ranks,
#                              y=p_values,
#                              hovertext=plt_df['gene_name'],
#                              mode="markers",
#                              marker=dict(color="black"),
#                              name=lib2),
#                   row=i+1, col=1)
#     
#     # Add line for a FDR cutoff level using Benjamini-Hochberg method
#     fig.add_trace(go.Scatter(y=[(0.05 / non_1_max_p_value_rank), 0.05],
#                              x=[1, non_1_max_p_value_rank],
#                              mode="lines",
#                              marker=dict(color="red"),),
#               row=i+1, col=1)
#     
#     # Zoom plots:
#     fig.add_trace(go.Scatter(x=p_value_ranks,
#                              y=p_values,
#                              hovertext=plt_df['gene_name'],
#                              mode="markers",
#                              marker=dict(color="black"),
#                              name=lib2),
#                   row=i+1, col=2)
#     fig.add_trace(go.Scatter(y=[(0.05 / non_1_max_p_value_rank), 0.05],
#                              x=[1, non_1_max_p_value_rank],
#                              mode="lines",
#                              marker=dict(color="red"),),
#               row=i+1, col=2)
#     
#     # Name axes:
#     fig.update_yaxes(title_text=f"Fishers P-Value",
#                      row=i+1, col=1)
#     
#     # Set zoom plot axis limits:
#     fig.update_layout(**{f"yaxis{(i+1)*2}": dict(range=[0, 0.01]),
#                          f"xaxis{(i+1)*2}": dict(range=[0, 25])})
# fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
#                  row=4, col=1)
# fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
#                  row=4, col=2)
# fig.update_layout(title=f"Fishers P-value vs Ranking of Fishers P-value",
#                   template="plotly_white",
#                   showlegend=False)
# fig.show()

In [None]:
# Build a working table of per-gene p-values with mitochondrial genes removed.
# reset_index() already returns a new frame, and the trailing copy() makes the
# filtered result independent so the rank columns added below are written to
# rank_df itself rather than to a slice.
rank_df = filter_df.reset_index().query("chr_id != 'MtDNA'").copy()

# This was an over-engineered solution; all I really want to check is smg-6
# replicability and new smg-5 v new smg-6.
lib_combos = [("oldN2", "oldS6"), ("newerN2", "newerS6"), ("newN2", "newS6")]
for lib1, lib2 in lib_combos:
    # Rank the Fisher's p-values within each comparison (Series.rank defaults:
    # ascending, ties share their average rank).
    rank_df[f"{lib1}_v_{lib2}_fishers_p_value_ranked"] = rank_df[f"{lib1}_v_{lib2}_fishers_p_value"].rank()

In [None]:
# Compare Fisher's p-values (ranked, then raw) between two library comparisons:
# lib1 vs lib2 on the x-axis and lib3 vs lib4 on the y-axis.
lib1, lib2, lib3, lib4 = 'newerN2', 'newerS6', 'oldN2', 'oldS6'

minimum_hits_for_both = 100
p_value_cutoff_for_both = 1

# Row filters applied one after another: both comparisons need more than the
# minimum number of cumulative unadapted hits and a p-value below the cutoff.
row_filters = [
    f"cumulative_unadapted_hits_{lib1}_{lib2} > {minimum_hits_for_both}",
    f"cumulative_unadapted_hits_{lib3}_{lib4} > {minimum_hits_for_both}",
    f'{lib1}_v_{lib2}_fishers_p_value < {p_value_cutoff_for_both}',
    f'{lib3}_v_{lib4}_fishers_p_value < {p_value_cutoff_for_both}',
]
plot_rank_df = rank_df
for row_filter in row_filters:
    plot_rank_df = plot_rank_df.query(row_filter)

# First the ranked p-values, then the raw p-values; each figure gets an OLS
# trendline and is shown immediately.
for column_suffix, title_noun in (("_ranked", "ranked p_values"), ("", "p_values")):
    fig = px.scatter(
        plot_rank_df,
        x=f'{lib1}_v_{lib2}_fishers_p_value{column_suffix}',
        y=f'{lib3}_v_{lib4}_fishers_p_value{column_suffix}',
        hover_name='gene_name',
        # hover_data=[''],
        # log_x=True, log_y=True,
        trendline='ols',
        title=f"Comparison of {title_noun} for {lib1}_v_{lib2} and {lib3}_v_{lib4}",
    )
    fig.show()

Plotting the comparison of p-values has helped me appreciate that the majority of genes failed to replicate between the libraries. Some genes of interest remain highly ranked in both library sets:
* F19B2.5   This is a fairly interesting little gene that has only one isoform. It clearly has some kind of NMD-dependent cleavage or degradation event happening (the coverage plots in wild type are sloped, while they're pretty square in NMD knockouts)