In [None]:
import sys
import warnings
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

from tqdm import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

import numpy as np
import pandas as pd
import statistics as stats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

print(f"Imports done at {npCommon.get_dt(for_print=True)}")

# Load pre-processed files:

In [None]:
# Load the newest cached read-level and gene-level ("compressed") dataframes from ./output_files.
try:
    reads_df_genes = pd.read_parquet(npCommon.find_newest_matching_file("./output_files/*_quad5TERA.reads_df.parquet"))
    compressed_df_genes = pd.read_parquet(npCommon.find_newest_matching_file("./output_files/*_quad5TERA.compressed_df.parquet"))
    print(f"Finished library loading at {npCommon.get_dt(for_print=True)}")
except ValueError as load_error:
    # Presumably find_newest_matching_file raises ValueError when nothing matches the glob -- TODO confirm.
    # The frames used to be set to None here, which only deferred the failure to an opaque
    # AttributeError on the `.copy()` call below. Fail right away with the actionable message instead.
    raise FileNotFoundError("Could not find pre-compressed dataframes saved. "
                            "Try running the first few cells of initialTestingAndScratchPaper.ipynb") from load_error

# Keep only the columns the rest of this notebook uses (subset first so only those columns are copied).
compressed_df_genes_short = compressed_df_genes[["lib", "chr_id", "gene_id", "gene_name", "t5", "gene_hits", "gene_rpm"]].copy()
# Quick sanity check on a single, well-known gene:
compressed_df_genes_short.query("gene_name == 'rpl-12'")

# Format libraries as columns with shortened names

In [None]:
# Short labels for each library; these labels are embedded in every derived column name below.
conversion_dict = {
    "xrn-1-5tera": "oldN2",
    "xrn-1-5tera-smg-6": "oldS6",
    "5tera_xrn-1-KD_wt": "newN2",
    "5tera_xrn-1-KD_smg-5": "newS5",
    "5tera_xrn-1-KD_smg-6": "newS6",
    "5tera_xrn-1-KD_smg-7": "newS7",
}

# Columns identifying a gene (become the shared index) and the per-library value columns to spread out.
gene_id_cols = ["chr_id", "gene_id", "gene_name"]
value_cols = ["gene_hits", "gene_rpm"]

# Build one gene-indexed frame per (library, t5) group, with the value columns
# suffixed by the short library label and the t5 state.
df_dict = {}
for _, group_df in compressed_df_genes_short.groupby(['lib', 't5'], as_index=False):
    short_lib = conversion_dict[group_df["lib"].iloc[0]]
    t5 = group_df["t5"].iloc[0]
    suffixed_names = {col: f'{col}_{short_lib}_t5{t5}' for col in value_cols}
    group_wide = group_df[gene_id_cols + value_cols].rename(columns=suffixed_names)
    df_dict[(short_lib, t5)] = group_wide.set_index(gene_id_cols)

# Outer-join all groups side by side; a gene missing from a group gets 0 for that group's columns.
super_df = pd.concat(df_dict.values(), axis=1, join='outer').fillna(0)
super_df

# Don't use the filtering in this step for now.

In [None]:
# For every library: add total (adapted + unadapted) rpm and hits columns plus the fraction of
# hits that are adapted (t5+) to super_df, then carry all of that library's columns into filter_df.
#
# No read-count filter is applied in this cell. The author's caveat about filtering here still stands:
# a single filter across all libraries would be dictated by the worst performing library.

filter_df = pd.DataFrame()
for short_lib in conversion_dict.values():
    for unit in ("rpm", "hits"):
        adapted = super_df[f"gene_{unit}_{short_lib}_t5+"]
        unadapted = super_df[f"gene_{unit}_{short_lib}_t5-"]
        super_df[f"total_gene_{unit}_{short_lib}"] = adapted + unadapted
    # Genes with zero total hits in this library end up as NaN here (0 / 0).
    super_df[f"fraction_adapted_{short_lib}"] = super_df[f"gene_hits_{short_lib}_t5+"] / super_df[f"total_gene_hits_{short_lib}"]

    # Every column whose name contains this library's short label:
    lib_columns = [name for name in super_df.columns if short_lib in name]
    filter_df[lib_columns] = super_df[lib_columns]
filter_df

In [None]:
from scipy.stats import chi2_contingency, chisquare, fisher_exact, boschloo_exact, barnard_exact
def row_chi2(row, target_lib_1, target_lib_2):
    array = np.array([[row[f"gene_hits_{target_lib_1}_t5-"], row[f"gene_hits_{target_lib_2}_t5-"]],
                      [row[f"gene_hits_{target_lib_1}_t5+"], row[f"gene_hits_{target_lib_2}_t5+"]]])
    try:
        chi2, p, deg_of_free, expected = chi2_contingency(array)
        return chi2, p
    except ValueError:
        return None, None

def row_fishers_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided'):
    """Run Fisher's exact test on one gene's unadapted (t5-) vs adapted (t5+) values for two libraries.

    :param row: anything indexable by column name holding the ``gene_{hits_or_rpm}_{lib}_t5-`` and
        ``gene_{hits_or_rpm}_{lib}_t5+`` values for both libraries
    :param target_lib_1: short library label, first column of the 2x2 table
    :param target_lib_2: short library label, second column of the 2x2 table
    :param hits_or_rpm: which value columns to read (``'hits'`` or ``'rpm'``)
    :param alternative: ``'two-sided'``, ``'greater'`` or ``'less'``; passed on to ``fisher_exact``
    :return: ``(odds_ratio, p_value)``
    :raises KeyError: if a column is missing from ``row`` or ``alternative`` is not one of the three options
    """
    unadapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]]
    adapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_pair, adapted_pair])

    allowed_alternatives = ('two-sided', 'greater', 'less')
    if alternative in allowed_alternatives:
        ratio, p = fisher_exact(contingency_table, alternative=alternative)
        return ratio, p
    raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for fisher's exact test!!")

def row_boschloo_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    """Run Boschloo's exact test on one gene's unadapted (t5-) vs adapted (t5+) values for two libraries.

    :param row: anything indexable by column name holding the ``gene_{hits_or_rpm}_{lib}_t5-`` and
        ``gene_{hits_or_rpm}_{lib}_t5+`` values for both libraries
    :param target_lib_1: short library label, first column of the 2x2 table
    :param target_lib_2: short library label, second column of the 2x2 table
    :param hits_or_rpm: which value columns to read (``'hits'`` or ``'rpm'``)
    :param alternative: ``'two-sided'``, ``'greater'`` or ``'less'``; passed on to ``boschloo_exact``
    :param sampling_points: forwarded to ``boschloo_exact`` as its ``n`` argument
    :return: ``(statistic, p_value)`` taken from the ``boschloo_exact`` result object
    :raises KeyError: if a column is missing from ``row`` or ``alternative`` is not one of the three options
    """
    unadapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]]
    adapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_pair, adapted_pair])

    allowed_alternatives = ('two-sided', 'greater', 'less')
    if alternative in allowed_alternatives:
        outcome = boschloo_exact(contingency_table, alternative=alternative, n=sampling_points)
        return outcome.statistic, outcome.pvalue
    raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Boschloo's exact test!!")

def row_barnard_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    """Run Barnard's exact test on one gene's unadapted (t5-) vs adapted (t5+) values for two libraries.

    :param row: anything indexable by column name holding the ``gene_{hits_or_rpm}_{lib}_t5-`` and
        ``gene_{hits_or_rpm}_{lib}_t5+`` values for both libraries
    :param target_lib_1: short library label, first column of the 2x2 table
    :param target_lib_2: short library label, second column of the 2x2 table
    :param hits_or_rpm: which value columns to read (``'hits'`` or ``'rpm'``)
    :param alternative: ``'two-sided'``, ``'greater'`` or ``'less'``; passed on to ``barnard_exact``
    :param sampling_points: forwarded to ``barnard_exact`` as its ``n`` argument
    :return: ``(statistic, p_value)`` taken from the ``barnard_exact`` result object
    :raises KeyError: if a column is missing from ``row`` or ``alternative`` is not one of the three options
    """
    unadapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]]
    adapted_pair = [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]
    contingency_table = np.array([unadapted_pair, adapted_pair])

    allowed_alternatives = ('two-sided', 'greater', 'less')
    if alternative in allowed_alternatives:
        outcome = barnard_exact(contingency_table, alternative=alternative, n=sampling_points)
        return outcome.statistic, outcome.pvalue
    raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Barnard's exact test!!")

In [None]:
# p-value setpoint and the applied cutoffs will be used for a Bonferroni correction
#   Currently no genes will be dropped based on these filters/cutoffs
base_sig_cutoff = 0.05

cumulative_min_read_cutoff = 100
filter_with_rpm_or_hits = "hits"
filter_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

# Maps ((first_lib, second_lib), stat_test) -> (Bonferroni-adjusted cutoff, number of genes passing the read cutoff)
p_value_cutoff_dict = {}

for libs in list(zip(["newN2"]*3, ["newS5",
                                   "newS6",
                                   "newS7"])) + [("oldN2", "oldS6")]:
    first_lib, second_lib = libs
    comparison = f"{first_lib}_v_{second_lib}"

    # NOTE(review): catch_warnings() without a filter installed inside it does not silence anything;
    # it only restores the warning filters on exit. Left as-is because it is unclear which warnings
    # were meant to be hidden -- add warnings.simplefilter(...) here if suppression is wanted.
    with warnings.catch_warnings():
        tqdm.pandas(desc=f"Calculating Chi Squared for {first_lib} and {second_lib}")
        filter_df[[f"{comparison}_chi2_test_result", f"{comparison}_chi2_p_value"]] = filter_df.progress_apply(lambda row: row_chi2(row, first_lib, second_lib), axis=1, result_type="expand")

        tqdm.pandas(desc=f"Calculating Fisher's exact for {first_lib} and {second_lib}")
        filter_df[[f"{comparison}_fishers_test_result", f"{comparison}_fishers_p_value"]] = filter_df.progress_apply(lambda row: row_fishers_exact(row, first_lib, second_lib, hits_or_rpm='hits', alternative='less'), axis=1, result_type="expand")

        # Barnard's and Boschloo tests take forever!!
        # tqdm.pandas(desc=f"Calculating Boschloo exact for {first_lib} and {second_lib}")
        # filter_df[[f"{first_lib}_v_{second_lib}_boschloo_test_result", f"{first_lib}_v_{second_lib}_boschloo_p_value"]] = filter_df.progress_apply(lambda row: row_boschloo_exact(row, first_lib, second_lib, hits_or_rpm='hits', alternative='less', sampling_points=4), axis=1, result_type="expand")

    # The cumulative read column and the Bonferroni cutoff depend only on the library pair,
    # not on the statistical test, so they are computed once per pair.
    lib_cols_for_correction = []
    for lib in libs:
        filter_col_converter = {"total": f"total_gene_{filter_with_rpm_or_hits}_{lib}",
                                "adapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5+",
                                "unadapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5-"}
        lib_cols_for_correction.append(filter_col_converter[filter_col_target])
    cumulative_col_name = f"cumulative_{filter_col_target}_{filter_with_rpm_or_hits}_{first_lib}_{second_lib}"
    filter_df[cumulative_col_name] = filter_df[lib_cols_for_correction[0]] + filter_df[lib_cols_for_correction[1]]
    number_of_genes_passing_cutoff = int((filter_df[cumulative_col_name] >= cumulative_min_read_cutoff).sum())
    if number_of_genes_passing_cutoff > 0:
        adjusted_sig_cutoff = base_sig_cutoff / number_of_genes_passing_cutoff
    else:
        # Avoid a ZeroDivisionError when nothing passes; fall back to the uncorrected cutoff and say so.
        adjusted_sig_cutoff = base_sig_cutoff
        print(f"WARNING: no genes passed the cutoff for {first_lib} and {second_lib}; using the uncorrected cutoff of {base_sig_cutoff}.")

    for stat_test in ['chi2',
                      # 'barnard',
                      'boschloo',
                      'fishers',
                      ]:
        p_value_cutoff_dict[(libs, stat_test)] = (adjusted_sig_cutoff, number_of_genes_passing_cutoff)
        print(f"There were {number_of_genes_passing_cutoff} genes that passed the cutoff of having >={cumulative_min_read_cutoff} cumulative {filter_col_target} {filter_with_rpm_or_hits} between {first_lib} and {second_lib}"
              f"\n\tA Bonferroni correction with this in mind will expect a p value of {adjusted_sig_cutoff:.3g} for a significant {stat_test} test result.")

        # Tests that were not run above (e.g. the commented-out Boschloo test) have no p-value column.
        p_value_col = f"{comparison}_{stat_test}_p_value"
        if p_value_col not in filter_df.columns:
            print(f"Couldn't find columns corresponding to '{stat_test}'!! Be sure spelling is correct!")
            continue
        filter_df[f"neg_log10_{p_value_col}"] = -np.log10(filter_df[p_value_col])
        filter_df[f"{comparison}_{stat_test}_significant"] = filter_df[p_value_col] <= adjusted_sig_cutoff
print('done.')

# Look at the top "most significant" hits

In [None]:
# For each test and library pair, print the genes with the smallest p-values
# alongside their adapted/unadapted rpm in both libraries.
how_many_hits_to_save = 100
lib_pairs_to_report = list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]
for stat_test in ["chi2", "fishers"]:
    for first_lib, second_lib in lib_pairs_to_report:
        comparison = f"{first_lib}_v_{second_lib}_{stat_test}"
        p_value_col = f"{comparison}_p_value"
        # rpm columns in the order: first lib t5-/t5+, then second lib t5-/t5+
        rpm_cols = [f"gene_rpm_{pair_lib}_t5{t5_state}"
                    for pair_lib in (first_lib, second_lib)
                    for t5_state in ("-", "+")]
        report_cols = rpm_cols + [f"{comparison}_test_result",
                                  p_value_col,
                                  f"{comparison}_significant"]
        temp_df = filter_df[report_cols].sort_values(p_value_col, ascending=True).head(how_many_hits_to_save)
        print(temp_df)
        # temp_df.to_csv(f"./output_files/{npCommon.get_dt()}_{first_lib}_v_{second_lib}.{stat_test}.top{how_many_hits_to_save}.csv")

In [None]:
# Scatter matrix comparing the fraction adapted between every pair of libraries.
fraction_adapted_cols = [col for col in filter_df.columns if "fraction_adapted" in col]
fraction_adapted_labels = {f"fraction_adapted_{lib}": f"Fraction Adapted <br> {lib}"
                           for lib in conversion_dict.values()}
fig = px.scatter_matrix(
    filter_df.reset_index(),  # gene identifiers live in the index; expose them as columns for hover
    dimensions=fraction_adapted_cols,
    labels=fraction_adapted_labels,
    hover_name="gene_name",
)
fig.update_layout(template="plotly_white")
# fig.update_layout(xaxis=dict(type='log'),xaxis2=dict(type='log'),xaxis3=dict(type='log'),xaxis4=dict(type='log'))
fig.show()

# Plot relationship of fraction adapted between "replicates"

In [137]:
# Scatter the fraction adapted of one library against another (the "replicate" pairs),
# fit an OLS trendline, print its summary and save the figure as html.
libs_combos_to_plot = [("oldN2", "newN2"), ("oldS6", "newS6")]  #+ list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]

drop_zeros = False
log_plot = True

# These settings carry over from the cell that ran the statistical tests:
# min_read_cutoff = 100
# filter_with_rpm_or_hits = "hits"
# filter_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

for libs_to_plot in libs_combos_to_plot:
    first_lib, second_lib = libs_to_plot
    # Every column that mentions either library of the pair:
    pair_cols = [col for col in filter_df.columns if first_lib in col or second_lib in col]
    plot_df = filter_df[pair_cols]

    # The read cutoff is applied again here, one library at a time, so it can be done stepwise.
    for lib in libs_to_plot:
        filter_col_converter = {"total": f"total_gene_{filter_with_rpm_or_hits}_{lib}",
                                "adapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5+",
                                "unadapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5-"}
        passes_cutoff = plot_df[filter_col_converter[filter_col_target]] >= cumulative_min_read_cutoff
        plot_df = plot_df[passes_cutoff]
        if drop_zeros:
            # To have the OLS work with logged data we need all values to be positive:
            plot_df = plot_df.query(f"fraction_adapted_{lib} > 0")

    # Dropping zeros also switches the trendline fit to log space and tags the title / file name.
    additional_plot_params = dict(trendline_options=dict(log_x=True, log_y=True)) if drop_zeros else {}
    additional_title_details = " (All zero values dropped)" if drop_zeros else ""
    additional_file_details = ".droppedZeros" if drop_zeros else ""
    if log_plot:
        additional_plot_params.update(log_x=True, log_y=True)

    axis_labels = {f"fraction_adapted_{lib}": f"Fraction Adapted <br> {lib}" for lib in conversion_dict.values()}
    fig = px.scatter(plot_df.reset_index(),
                     x=f"fraction_adapted_{first_lib}",
                     y=f"fraction_adapted_{second_lib}",
                     # log_x=True, log_y=True,
                     hover_name="gene_name",
                     hover_data=list(pair_cols),
                     trendline="ols",
                     labels=axis_labels,
                     **additional_plot_params)

    print(first_lib, second_lib)
    results = px.get_trendline_results(fig).px_fit_results.iloc[0]
    print(results.summary())

    plot_title = f"<b>Fraction Adapted for {first_lib} and {second_lib}</b><br>w/ {filter_with_rpm_or_hits} cutoff of {cumulative_min_read_cutoff} across plotted libs {'&nbsp;' * 5} OLS Trendline; R<sup>2</sup>: {results.rsquared:0.4f}; adjR<sup>2</sup>: {results.rsquared_adj:0.4f} {'&nbsp;' * 5} n={plot_df.shape[0]}{additional_title_details}"
    fig.update_layout(template="plotly_white", title=plot_title)
    html_path = f"./output_files/{npCommon.get_dt()}_fractionAdapted_{first_lib}-v-{second_lib}_min{cumulative_min_read_cutoff}{filter_col_target}{filter_with_rpm_or_hits.title()}{additional_file_details}.scatter.html"
    fig.write_html(html_path)
    fig.show()

oldN2 newN2
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.730
Method:                 Least Squares   F-statistic:                     407.3
Date:                Tue, 10 Jan 2023   Prob (F-statistic):           1.81e-44
Time:                        16:00:56   Log-Likelihood:                 389.28
No. Observations:                 151   AIC:                            -774.6
Df Residuals:                     149   BIC:                            -768.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0014      0.002     -0.

# Plot to try and decide cutoff for total/adapted/unadapted hits/rpm

In [None]:
# Library pairs to compare, and which read metric goes on the x-axis of the plots below.
libs_combos_to_plot = [
    ("oldN2", "newN2"),
    ("oldS6", "newS6"),
    ("newN2", "newS6"),
]  #+ list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]
plot_with_rpm_or_hits = "hits"
plot_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

def calc_stdDev_and_stdDevOverMean(values):
    """Return the sample standard deviation, the mean, and their ratio for ``values``.

    :param values: sequence of at least two numbers (``statistics.stdev`` needs two data points)
    :return: ``(std_dev, mean, std_dev_over_mean)``; the ratio is ``inf`` when the mean is zero
    """
    std_dev = stats.stdev(values)
    mean = stats.mean(values)
    # Test for a zero mean explicitly: NumPy floats do not raise ZeroDivisionError on division
    # by zero (they yield nan/inf with a warning), so an except-clause would never fire for
    # values coming out of a DataFrame. Also uses np.inf, since the np.Inf alias is gone in NumPy 2.
    if mean == 0:
        std_dev_over_mean = np.inf
    else:
        std_dev_over_mean = std_dev / mean
    return std_dev, mean, std_dev_over_mean

# For each library pair: derive comparison metrics between the two fraction-adapted columns and
# plot their difference against the combined read metric, to help pick a sensible read cutoff.
for libs_to_plot in libs_combos_to_plot:
    first_lib, second_lib = libs_to_plot
    pair_cols = [col for col in filter_df.columns if first_lib in col or second_lib in col]
    # .copy() because new columns are added below; assigning into a column-slice of filter_df
    # triggers pandas' SettingWithCopyWarning.
    plot_df = filter_df[pair_cols].copy()

    plot_cols = []
    for lib in libs_to_plot:
        plot_col_converter = {"total": f"total_gene_{plot_with_rpm_or_hits}_{lib}",
                              "adapted": f"gene_{plot_with_rpm_or_hits}_{lib}_t5+",
                              "unadapted": f"gene_{plot_with_rpm_or_hits}_{lib}_t5-"}
        plot_cols.append(plot_col_converter[plot_col_target])
    metric_col_name = f"combined_{plot_col_target}_{plot_with_rpm_or_hits}_{first_lib}+{second_lib}"
    plot_df[metric_col_name] = plot_df[plot_cols[0]] + plot_df[plot_cols[1]]

    fraction_first = plot_df[f"fraction_adapted_{first_lib}"]
    fraction_second = plot_df[f"fraction_adapted_{second_lib}"]

    # Ratio and log2 ratio of the fractions (computed once; the log2 ratio is stored under two names).
    fraction_ratio = fraction_first / fraction_second
    log2_fraction_ratio = np.log2(fraction_ratio)
    plot_df[f"fracAdapted_{first_lib}/{second_lib}"] = fraction_ratio
    plot_df[f"log2_fracAdapted_{first_lib}/{second_lib}"] = log2_fraction_ratio

    plot_df[f"deltaFracAdapted_{first_lib}-{second_lib}"] = fraction_first - fraction_second

    plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"] = log2_fraction_ratio

    plot_df[[f"stdDev_fracAdapted_{first_lib}_{second_lib}", f"mean_fracAdapted_{first_lib}_{second_lib}", f"stdDevOverMean_fracAdapted_{first_lib}_{second_lib}"]] = plot_df.apply(lambda row: calc_stdDev_and_stdDevOverMean([row[f"fraction_adapted_{first_lib}"], row[f"fraction_adapted_{second_lib}"]]), axis=1, result_type="expand")

    fig = px.scatter(plot_df.reset_index(),
                     x=metric_col_name,
                     # =f"mean_fracAdapted_{first_lib}_{second_lib}",
                     # y=f"stdDev_fracAdapted_{first_lib}_{second_lib}",
                     y=f"deltaFracAdapted_{first_lib}-{second_lib}",
                     color="chr_id",
                     log_x=True,
                     hover_name="gene_name",
                     hover_data=["gene_id", "chr_id"] + pair_cols)

    fig.update_layout(template="plotly_white")
    # Build the shared path once so the html and png of one figure carry the same timestamp.
    output_stem = f"./output_files/{npCommon.get_dt()}_deltaFracAdapted-v-combined{plot_col_target.title()}{plot_with_rpm_or_hits.title()}_{first_lib}-{second_lib}.scatter"
    fig.write_html(f"{output_stem}.html")
    fig.write_image(f"{output_stem}.png")
    fig.show()
    print(plot_df.shape)

# Plot fraction adapted fold change and Chi2/Fishers p-values

In [None]:
from plotly.subplots import make_subplots

# Volcano-style panels: log2 fold change of fraction adapted (x) vs -log10 p-value (y),
# one panel per N2-vs-smg comparison, with the Bonferroni-adjusted cutoff drawn as a line.
stat_test = "chi2"  # "fishers" or "chi2"

# (first_lib, second_lib, trace name), in left-to-right panel order; must match subplot_titles below.
volcano_panels = [("oldN2", "oldS6", "smg-6"),
                  ("newN2", "newS5", "smg-5"),
                  ("newN2", "newS6", "smg-6"),
                  ("newN2", "newS7", "smg-7")]

plot_df = filter_df.copy(deep=True)
for first_lib, second_lib, _ in volcano_panels:
    plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"] = np.log2(plot_df[f"fraction_adapted_{first_lib}"] / plot_df[f"fraction_adapted_{second_lib}"])
plot_df = plot_df.reset_index()
plot_df = plot_df[plot_df.chr_id != 'MtDNA']
# Infinite fold changes are pinned to +/-6 and labelled "INF" on the x-axis ticks below.
# NOTE(review): this replace is frame-wide, so an infinite -log10 p-value (p == 0) is also drawn at 6.
plot_df = plot_df.replace({-np.inf: -6, np.inf: 6})


fig = make_subplots(rows=1, cols=4,
                    shared_yaxes=True,
                    shared_xaxes=True,
                    vertical_spacing=0.02,
                    horizontal_spacing=0.02,
                    subplot_titles=["Old N2 vs. smg-6",
                                    "New N2 vs. smg-5",
                                    "New N2 vs. smg-6",
                                    "New N2 vs. smg-7"])

for panel_col, (first_lib, second_lib, trace_name) in enumerate(volcano_panels, start=1):
    comparison = f"{first_lib}_v_{second_lib}_{stat_test}"
    fig.add_trace(go.Scatter(
        x=plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"],
        y=plot_df[f"neg_log10_{comparison}_p_value"],
        marker_color=plot_df[f"{comparison}_significant"].map({False: "black", True: "red"}),
        hovertext=plot_df["gene_name"],
        mode="markers",
        name=trace_name),
                  row=1, col=panel_col)

    # Look the cutoff up for the test that is actually plotted (this was hard-coded to "fishers").
    cutoff, num_passed_genes = p_value_cutoff_dict[((first_lib, second_lib), stat_test)]
    fig.add_hline(y=-np.log10(cutoff),
                  row=1, col=panel_col)
    # The first panel keeps its longer label.
    cutoff_label = "Bonferroni adjusted" if panel_col == 1 else "Bonferroni"
    fig.add_annotation(text=f"{cutoff_label}<br>p-value = {cutoff:0.3e}",
                       yref="y",
                       y=-np.log10(cutoff),
                       xref="x domain",
                       x=0,
                       yanchor="bottom",
                       showarrow=False,
                       align="left",
                       row=1, col=panel_col)
    # The text reports the cutoff that was actually applied (>= cumulative_min_read_cutoff).
    fig.add_annotation(text=f"<b>genes w/ >= {cumulative_min_read_cutoff} cumulative {filter_col_target} {filter_with_rpm_or_hits} = {num_passed_genes}</b>",
                       yref="y domain",
                       y=0,
                       xref="x domain",
                       x=0.5,
                       yanchor="bottom",
                       showarrow=False,
                       align="center",
                       row=1, col=panel_col)

fig.update_layout(template="plotly_white",
                  title=f"<b>FC Frac. Adapted vs. {stat_test.title()} Test P-Values</b>"
                        f"<br>Cumulative {filter_col_target} {filter_with_rpm_or_hits} for Cutoff = {cumulative_min_read_cutoff}")

# Numeric tick positions; +/-6 are the stand-ins for infinite fold changes (see the replace above).
fig.update_xaxes(tickvals=[-6, -4, -2, 0, 2, 4, 6],
                 ticktext=["-INF", "-4", "-2", "0", "2", "4", "INF"])

fig.update_yaxes(title_text=f"-log<sub>10</sub> {stat_test.title()} Test P-Value",
                 row=1, col=1)
fig.update_xaxes(title_text="Fold Change Fraction Adapted<br>N2 vs. smg")
fig.write_html(f"./output_files/{npCommon.get_dt()}_{stat_test}Pvalues-v-FCfractionAdapted.scatters.html")
fig.show()
plot_df.query("gene_name == 'F19B2.5'").T

# Assessing p-value rank "shape"

In [172]:
# Fisher's p-value vs. its rank for each comparison: full view in the left column, a zoom on the
# smallest p-values in the right column. The red line runs from (1, 0.05 / m) to (m, 0.05),
# where m is the second-largest unique rank.
fig = make_subplots(rows=4, cols=2,
                    column_widths=[0.8, 0.2],
                    shared_yaxes=False,
                    shared_xaxes=True,
                    vertical_spacing=0.1,
                    horizontal_spacing=0.02,
                    subplot_titles=["Old N2 vs. smg-6", "(zoom)",
                                    "New N2 vs. smg-5", "(zoom)",
                                    "New N2 vs. smg-6", "(zoom)",
                                    "New N2 vs. smg-7", "(zoom)",])

for i, (lib1, lib2) in enumerate([("oldN2", "oldS6"),
                                  ("newN2", "newS5"),
                                  ("newN2", "newS6"),
                                  ("newN2", "newS7")]):
    panel_row = i + 1
    p_value_col = f"{lib1}_v_{lib2}_fishers_p_value"
    plt_df = filter_df.sort_values(p_value_col).reset_index()
    p_values = plt_df[p_value_col]
    p_value_ranks = p_values.rank()

    # Second-largest unique rank. Sort explicitly: iterating a set has no guaranteed order,
    # so list(set(ranks))[-2] is not reliably the second-largest value.
    unique_ranks = np.sort(p_value_ranks.dropna().unique())
    if unique_ranks.size >= 2:
        non_1_max_p_value_rank = float(unique_ranks[-2])
    else:
        # Degenerate case (all p-values tied, or none at all): fall back to the number of p-values.
        non_1_max_p_value_rank = float(max(int(p_values.notna().sum()), 1))
    # ^ This value is similar to the number of p-values that I am "really testing"

    # Same two traces in the overall (col 1) and zoom (col 2) panels:
    for panel_col in (1, 2):
        fig.add_trace(go.Scatter(x=p_value_ranks,
                                 y=p_values,
                                 hovertext=plt_df['gene_name'],
                                 mode="markers",
                                 marker=dict(color="black"),
                                 name=lib2),
                      row=panel_row, col=panel_col)
        fig.add_trace(go.Scatter(y=[(0.05 / non_1_max_p_value_rank), 0.05],
                                 x=[1, non_1_max_p_value_rank],
                                 mode="lines",
                                 marker=dict(color="red"),),
                      row=panel_row, col=panel_col)

    # Name axes:
    fig.update_yaxes(title_text="Fishers P-Value",
                     row=panel_row, col=1)

    # Set zoom plot axis limits (addressed by row/col rather than by hand-computed axis numbers):
    fig.update_yaxes(range=[0, 0.01], row=panel_row, col=2)
    fig.update_xaxes(range=[0, 25], row=panel_row, col=2)
fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
                 row=4, col=1)
fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
                 row=4, col=2)
fig.update_layout(title="Fishers P-value vs Ranking of Fishers P-value",
                  template="plotly_white",
                  showlegend=False)
fig.show()

In [208]:
# Compare the rank of each gene's Fisher's p-value between two N2-vs-smg-6 comparisons (new vs. old).
# .copy() after the query because rank columns are added below (avoids SettingWithCopyWarning).
rank_df = filter_df.copy(deep=True).reset_index()
rank_df = rank_df.query("chr_id != 'MtDNA'").copy()

# This was an over-engineered solution; all I really want to check is smg-6 replicability and new smg-5 v new smg-6
lib_combos = [("oldN2", "oldS6"), ("newN2", "newS5"), ("newN2", "newS6"), ("newN2", "newS7")]
for i, libs1 in enumerate(lib_combos):
    libs2 = lib_combos[(i+1) % len(lib_combos)]
    print(libs1, libs2)

    # but I can still use this loop for this step:
    lib1, lib2 = libs1
    rank_df[f"{lib1}_v_{lib2}_fishers_p_value_ranked"] = rank_df[f"{lib1}_v_{lib2}_fishers_p_value"].rank()

lib1, lib2, lib3, lib4 = 'newN2', 'newS6', 'oldN2', 'oldS6'

# The cumulative read columns are created by the statistical-test cell as
# "cumulative_{filter_col_target}_{filter_with_rpm_or_hits}_{first_lib}_{second_lib}".
# The old "cummulative_..." spelling used here matched no column, so the query failed.
# NOTE(review): the threshold below is a strict "> 100", while the Bonferroni gene count uses
# ">= cumulative_min_read_cutoff" -- confirm which one is intended.
cumulative_prefix = f"cumulative_{filter_col_target}_{filter_with_rpm_or_hits}"
plot_rank_df = rank_df.query(f"{cumulative_prefix}_{lib1}_{lib2} > 100").query(f"{cumulative_prefix}_{lib3}_{lib4} > 100")
# Drop genes whose p-value is exactly 1 in either comparison:
plot_rank_df = plot_rank_df.query(f'{lib1}_v_{lib2}_fishers_p_value < 1').query(f'{lib3}_v_{lib4}_fishers_p_value < 1')

fig = px.scatter(plot_rank_df,
                 x=f'{lib1}_v_{lib2}_fishers_p_value_ranked',
                 y=f'{lib3}_v_{lib4}_fishers_p_value_ranked',
                 hover_name='gene_name',
                 # hover_data=[''],
                 # log_x=True, log_y=True,
                 trendline='ols',
                 )
fig.show()

('oldN2', 'oldS6') ('newN2', 'newS5')
('newN2', 'newS5') ('newN2', 'newS6')
('newN2', 'newS6') ('newN2', 'newS7')
('newN2', 'newS7') ('oldN2', 'oldS6')
