In [17]:
import sys
import warnings
sys.path.insert(0, '/data16/marcus/scripts/nanoporePipelineScripts')
import nanoporePipelineCommon as npCommon

from tqdm import tqdm

import seaborn as sea
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

import numpy as np
import pandas as pd
import statistics as stats
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

print(f"Imports done at {npCommon.get_dt(for_print=True)}")

Imports done at 01/24/23 @ 06:16:44 PM


# Load pre-processed files:

In [18]:
try:
    reads_df_genes = pd.read_parquet(npCommon.find_newest_matching_file(f"./output_files/*_quad5TERA.reads_df.parquet"))
    compressed_df_genes = pd.read_parquet(npCommon.find_newest_matching_file(f"./output_files/*_quad5TERA.compressed_df.parquet"))
    print(f"Finished library loading at {npCommon.get_dt(for_print=True)}")
except ValueError:
    reads_df_genes, compressed_df_genes = None, None
    print("Could not find pre-compressed dataframes saved. Try running the first few cells of initialTestingAndScratchPaper.ipynb")
compressed_df_genes_short = compressed_df_genes.copy()[["lib", "chr_id", "gene_id", "gene_name", "t5", "gene_hits", "gene_rpm"]]
compressed_df_genes_short.query("gene_name == 'rpl-12'")

Finished library loading at 01/24/23 @ 06:16:47 PM


Unnamed: 0,lib,chr_id,gene_id,gene_name,t5,gene_hits,gene_rpm
6470,5tera_xrn-1-KD_smg-5,IV,WBGene00004424,rpl-12,-,880,6637.101397
6471,5tera_xrn-1-KD_smg-5,IV,WBGene00004424,rpl-12,+,7,52.795125
16600,5tera_xrn-1-KD_smg-6,IV,WBGene00004424,rpl-12,-,420,5932.706162
16601,5tera_xrn-1-KD_smg-6,IV,WBGene00004424,rpl-12,+,5,70.627454
27164,5tera_xrn-1-KD_smg-7,IV,WBGene00004424,rpl-12,-,691,4533.496041
27165,5tera_xrn-1-KD_smg-7,IV,WBGene00004424,rpl-12,+,23,150.897842
38673,5tera_xrn-1-KD_wt,IV,WBGene00004424,rpl-12,-,437,4214.851323
38674,5tera_xrn-1-KD_wt,IV,WBGene00004424,rpl-12,+,10,96.449687
53625,xrn-1-5tera,IV,WBGene00004424,rpl-12,-,1759,3710.19555
53626,xrn-1-5tera,IV,WBGene00004424,rpl-12,+,49,103.353941


# Format libraries as columns with shortened names

In [19]:
conversion_dict = {"xrn-1-5tera": "oldN2",
                   "xrn-1-5tera-smg-6": "oldS6",
                   "5tera_xrn-1-KD_wt": "newN2",
                   "5tera_xrn-1-KD_smg-5": "newS5",
                   "5tera_xrn-1-KD_smg-6": "newS6",
                   "5tera_xrn-1-KD_smg-7": "newS7",
                   }
ans = [y for x, y in compressed_df_genes_short.groupby(['lib', 't5'], as_index=False)]
df_dict = {}
for i, df in enumerate(ans):
    lib = df.lib.unique()[0]
    t5 = df.t5.unique()[0]
    df = df[["chr_id", "gene_id", "gene_name", "gene_hits", "gene_rpm"]]
    df = df.rename(columns={col: f'{col}_{conversion_dict[lib]}_t5{t5}' for col in df.columns if col not in ["chr_id", "gene_id", "gene_name"]})
    df_dict[(conversion_dict[lib], t5)] = df.set_index(["chr_id", "gene_id", "gene_name"])
    # print((conversion_dict[lib], t5))
    # print(df_dict[(conversion_dict[lib], t5)].query("gene_name == 'rpl-12'"))

super_df = pd.concat(df_dict.values(), axis=1, join='outer').fillna(0)
super_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gene_hits_newS5_t5+,gene_rpm_newS5_t5+,gene_hits_newS5_t5-,gene_rpm_newS5_t5-,gene_hits_newS6_t5+,gene_rpm_newS6_t5+,gene_hits_newS6_t5-,gene_rpm_newS6_t5-,gene_hits_newS7_t5+,gene_rpm_newS7_t5+,gene_hits_newS7_t5-,gene_rpm_newS7_t5-,gene_hits_newN2_t5+,gene_rpm_newN2_t5+,gene_hits_newN2_t5-,gene_rpm_newN2_t5-,gene_hits_oldN2_t5+,gene_rpm_oldN2_t5+,gene_hits_oldN2_t5-,gene_rpm_oldN2_t5-,gene_hits_oldS6_t5+,gene_rpm_oldS6_t5+,gene_hits_oldS6_t5-,gene_rpm_oldS6_t5-
chr_id,gene_id,gene_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
I,WBGene00000158,apg-1,1.0,7.542161,5.0,37.710803,0.0,0.000000,2.0,28.250982,0.0,0.000000,15.0,98.411636,0.0,0.0,4.0,38.579875,1.0,2.109264,32.0,67.496451,0.0,0.000000,4.0,24.644199
I,WBGene00000230,atp-3,1.0,7.542161,143.0,1078.528977,2.0,28.250982,83.0,1172.415741,5.0,32.803879,143.0,938.190932,0.0,0.0,93.0,896.982089,10.0,21.092641,536.0,1130.565557,4.0,24.644199,106.0,653.071283
I,WBGene00000474,cey-3,1.0,7.542161,32.0,241.349142,0.0,0.000000,9.0,127.129418,1.0,6.560776,30.0,196.823272,0.0,0.0,53.0,511.183341,8.0,16.874113,265.0,558.954986,1.0,6.161050,58.0,357.340891
I,WBGene00000500,chn-1,1.0,7.542161,3.0,22.626482,0.0,0.000000,2.0,28.250982,0.0,0.000000,7.0,45.925430,0.0,0.0,3.0,28.934906,2.0,4.218528,31.0,65.387187,0.0,0.000000,14.0,86.254698
I,WBGene00000625,col-48,1.0,7.542161,38.0,286.602106,0.0,0.000000,5.0,70.627454,1.0,6.560776,29.0,190.262497,0.0,0.0,39.0,376.153779,1.0,2.109264,54.0,113.900261,0.0,0.000000,32.0,197.153595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X,WBGene00044135,F08G12.11,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,6.161050
X,WBGene00045291,F23D12.10,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,2.0,12.322100
X,WBGene00077592,T25B2.3,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,6.161050
X,WBGene00194983,T25B2.4,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,1.0,6.161050


# Don't use the filtering in this step for now.

In [20]:
# This step will calculate total hits and the fraction adapted for each gene, then apply a filter for a minimum number of gene hits, adapted hits or unadapted hits

# This is currently a little problematic because it will filter all libraries based on the worst preforming library!

filter_df = pd.DataFrame()
for lib in conversion_dict.values():
    for rpm_or_hits in ["rpm", "hits"]:
        super_df[f"total_gene_{rpm_or_hits}_{lib}"] = super_df[f"gene_{rpm_or_hits}_{lib}_t5+"] + super_df[f"gene_{rpm_or_hits}_{lib}_t5-"]
    super_df[f"fraction_adapted_{lib}"] = super_df[f"gene_hits_{lib}_t5+"] / super_df[f"total_gene_hits_{lib}"]
    
    cols_to_carry_over = [col for col in super_df.columns if lib in col]
    filter_df[cols_to_carry_over] = super_df[cols_to_carry_over]
filter_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gene_hits_oldN2_t5+,gene_rpm_oldN2_t5+,gene_hits_oldN2_t5-,gene_rpm_oldN2_t5-,total_gene_rpm_oldN2,total_gene_hits_oldN2,fraction_adapted_oldN2,gene_hits_oldS6_t5+,gene_rpm_oldS6_t5+,gene_hits_oldS6_t5-,gene_rpm_oldS6_t5-,total_gene_rpm_oldS6,total_gene_hits_oldS6,fraction_adapted_oldS6,gene_hits_newN2_t5+,gene_rpm_newN2_t5+,gene_hits_newN2_t5-,gene_rpm_newN2_t5-,total_gene_rpm_newN2,total_gene_hits_newN2,fraction_adapted_newN2,gene_hits_newS5_t5+,gene_rpm_newS5_t5+,gene_hits_newS5_t5-,gene_rpm_newS5_t5-,total_gene_rpm_newS5,total_gene_hits_newS5,fraction_adapted_newS5,gene_hits_newS6_t5+,gene_rpm_newS6_t5+,gene_hits_newS6_t5-,gene_rpm_newS6_t5-,total_gene_rpm_newS6,total_gene_hits_newS6,fraction_adapted_newS6,gene_hits_newS7_t5+,gene_rpm_newS7_t5+,gene_hits_newS7_t5-,gene_rpm_newS7_t5-,total_gene_rpm_newS7,total_gene_hits_newS7,fraction_adapted_newS7
chr_id,gene_id,gene_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
I,WBGene00000158,apg-1,1.0,2.109264,32.0,67.496451,69.605715,33.0,0.030303,0.0,0.000000,4.0,24.644199,24.644199,4.0,0.000000,0.0,0.0,4.0,38.579875,38.579875,4.0,0.0,1.0,7.542161,5.0,37.710803,45.252964,6.0,0.166667,0.0,0.000000,2.0,28.250982,28.250982,2.0,0.000000,0.0,0.000000,15.0,98.411636,98.411636,15.0,0.000000
I,WBGene00000230,atp-3,10.0,21.092641,536.0,1130.565557,1151.658198,546.0,0.018315,4.0,24.644199,106.0,653.071283,677.715483,110.0,0.036364,0.0,0.0,93.0,896.982089,896.982089,93.0,0.0,1.0,7.542161,143.0,1078.528977,1086.071138,144.0,0.006944,2.0,28.250982,83.0,1172.415741,1200.666723,85.0,0.023529,5.0,32.803879,143.0,938.190932,970.994810,148.0,0.033784
I,WBGene00000474,cey-3,8.0,16.874113,265.0,558.954986,575.829099,273.0,0.029304,1.0,6.161050,58.0,357.340891,363.501941,59.0,0.016949,0.0,0.0,53.0,511.183341,511.183341,53.0,0.0,1.0,7.542161,32.0,241.349142,248.891302,33.0,0.030303,0.0,0.000000,9.0,127.129418,127.129418,9.0,0.000000,1.0,6.560776,30.0,196.823272,203.384048,31.0,0.032258
I,WBGene00000500,chn-1,2.0,4.218528,31.0,65.387187,69.605715,33.0,0.060606,0.0,0.000000,14.0,86.254698,86.254698,14.0,0.000000,0.0,0.0,3.0,28.934906,28.934906,3.0,0.0,1.0,7.542161,3.0,22.626482,30.168643,4.0,0.250000,0.0,0.000000,2.0,28.250982,28.250982,2.0,0.000000,0.0,0.000000,7.0,45.925430,45.925430,7.0,0.000000
I,WBGene00000625,col-48,1.0,2.109264,54.0,113.900261,116.009525,55.0,0.018182,0.0,0.000000,32.0,197.153595,197.153595,32.0,0.000000,0.0,0.0,39.0,376.153779,376.153779,39.0,0.0,1.0,7.542161,38.0,286.602106,294.144266,39.0,0.025641,0.0,0.000000,5.0,70.627454,70.627454,5.0,0.000000,1.0,6.560776,29.0,190.262497,196.823272,30.0,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X,WBGene00044135,F08G12.11,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,1.0,6.161050,6.161050,1.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,
X,WBGene00045291,F23D12.10,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,2.0,12.322100,12.322100,2.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,
X,WBGene00077592,T25B2.3,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,1.0,6.161050,6.161050,1.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,
X,WBGene00194983,T25B2.4,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,1.0,6.161050,6.161050,1.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.000000,0.0,0.000000,0.000000,0.0,


In [21]:
from scipy.stats import chi2_contingency, chisquare, fisher_exact, boschloo_exact, barnard_exact
def row_chi2(row, target_lib_1, target_lib_2):
    array = np.array([[row[f"gene_hits_{target_lib_1}_t5-"], row[f"gene_hits_{target_lib_2}_t5-"]],
                      [row[f"gene_hits_{target_lib_1}_t5+"], row[f"gene_hits_{target_lib_2}_t5+"]]])
    try:
        chi2, p, deg_of_free, expected = chi2_contingency(array)
        return chi2, p
    except ValueError:
        return None, None

def row_fishers_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided'):
    array = np.array([[row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]],
                      [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]])
    if alternative not in ['two-sided', 'greater', 'less']:
        raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for fisher's exact test!!")
    odds_ratio, p_value = fisher_exact(array, alternative=alternative)
    return odds_ratio, p_value

def row_boschloo_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    array = np.array([[row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]],
                      [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]])
    if alternative not in ['two-sided', 'greater', 'less']:
        raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Boschloo's exact test!!")
    boschloo_result = boschloo_exact(array, alternative=alternative, n=sampling_points)
    return boschloo_result.statistic, boschloo_result.pvalue

def row_barnard_exact(row, target_lib_1, target_lib_2, hits_or_rpm='hits', alternative='two-sided', sampling_points=32):
    array = np.array([[row[f"gene_{hits_or_rpm}_{target_lib_1}_t5-"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5-"]],
                      [row[f"gene_{hits_or_rpm}_{target_lib_1}_t5+"], row[f"gene_{hits_or_rpm}_{target_lib_2}_t5+"]]])
    if alternative not in ['two-sided', 'greater', 'less']:
        raise KeyError(f"Please use 'two-sided', 'greater', or 'less' for the alternative hypothesis input for Barnard's exact test!!")
    barnard_result = barnard_exact(array, alternative=alternative, n=sampling_points)
    return barnard_result.statistic, barnard_result.pvalue

In [22]:
# p-value setpoint and the applied cutoffs will be used for a Bonferroni correction
#   Currently no genes will be dropped based on these filters/cutoffs
base_sig_cutoff = 0.05

cumulative_min_read_cutoff = 100
filter_with_rpm_or_hits = "hits"
filter_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

p_value_cutoff_dict = {}

first_lib = "newN2"
for libs in list(zip(["newN2"]*3, ["newS5",
                                   "newS6",
                                   "newS7"])) + [("oldN2", "oldS6")]:
    first_lib, second_lib = libs
    with warnings.catch_warnings():
        tqdm.pandas(desc=f"Calculating Chi Squared for {first_lib} and {second_lib}")
        filter_df[[f"{first_lib}_v_{second_lib}_chi2_test_result", f"{first_lib}_v_{second_lib}_chi2_p_value"]] = filter_df.progress_apply(lambda row: row_chi2(row, first_lib, second_lib), axis=1, result_type="expand")
        
        tqdm.pandas(desc=f"Calculating Fisher's exact for {first_lib} and {second_lib}")
        filter_df[[f"{first_lib}_v_{second_lib}_fishers_test_result", f"{first_lib}_v_{second_lib}_fishers_p_value"]] = filter_df.progress_apply(lambda row: row_fishers_exact(row, first_lib, second_lib, hits_or_rpm='hits', alternative='less'), axis=1, result_type="expand")
        
        # Barnard's and Boschloo tests take forever!!
        # tqdm.pandas(desc=f"Calculating Boschloo exact for {first_lib} and {second_lib}")
        # filter_df[[f"{first_lib}_v_{second_lib}_boschloo_test_result", f"{first_lib}_v_{second_lib}_boschloo_p_value"]] = filter_df.progress_apply(lambda row: row_boschloo_exact(row, first_lib, second_lib, hits_or_rpm='hits', alternative='less', sampling_points=4), axis=1, result_type="expand")
    
    for stat_test in ['chi2',
                      # 'barnard',
                      'boschloo',
                      'fishers',
                      ]:
        lib_cols_for_correction = []
        for lib in libs:
            filter_col_converter = {"total": f"total_gene_{filter_with_rpm_or_hits}_{lib}",
                                    "adapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5+",
                                    "unadapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5-"}
            lib_cols_for_correction.append(filter_col_converter[filter_col_target])
        cumulative_col_name = f"cumulative_{filter_col_target}_{filter_with_rpm_or_hits}_{first_lib}_{second_lib}"
        filter_df[cumulative_col_name] = filter_df[lib_cols_for_correction[0]] + filter_df[lib_cols_for_correction[1]]
        number_of_genes_passing_cutoff = filter_df[filter_df[cumulative_col_name] >= cumulative_min_read_cutoff].shape[0]
        adjusted_sig_cutoff = base_sig_cutoff / number_of_genes_passing_cutoff
        p_value_cutoff_dict[(libs, stat_test)] = (adjusted_sig_cutoff, number_of_genes_passing_cutoff)
        print(f"There were {number_of_genes_passing_cutoff} genes that passed the cutoff of having >={cumulative_min_read_cutoff} cumulative {filter_col_target} {filter_with_rpm_or_hits} between {first_lib} and {second_lib}"
              f"\n\tA Bonferroni correction with this in mind will expect a p value of {adjusted_sig_cutoff:.3g} for a significant {stat_test} test result.")
        try:
            filter_df.sort_values(f"{first_lib}_v_{second_lib}_{stat_test}_p_value")
            filter_df[f"neg_log10_{first_lib}_v_{second_lib}_{stat_test}_p_value"] = -np.log10(filter_df[f"{first_lib}_v_{second_lib}_{stat_test}_p_value"])
            filter_df[f"{first_lib}_v_{second_lib}_{stat_test}_significant"] = filter_df[f"{first_lib}_v_{second_lib}_{stat_test}_p_value"] <= adjusted_sig_cutoff
        except KeyError:
            print(f"Couldn't find columns corresponding to '{stat_test}'!! Be sure spelling is correct!")
print('done.')


invalid value encountered in divide

Calculating Chi Squared for newN2 and newS5: 100%|██████████| 14717/14717 [00:01<00:00, 9844.93it/s] 
Calculating Fisher's exact for newN2 and newS5: 100%|██████████| 14717/14717 [00:00<00:00, 20016.59it/s]


There were 343 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS5
	A Bonferroni correction with this in mind will expect a p value of 0.000146 for a significant chi2 test result.
There were 343 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS5
	A Bonferroni correction with this in mind will expect a p value of 0.000146 for a significant boschloo test result.
Couldn't find columns corresponding to 'boschloo'!! Be sure spelling is correct!
There were 343 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS5
	A Bonferroni correction with this in mind will expect a p value of 0.000146 for a significant fishers test result.



invalid value encountered in divide

Calculating Chi Squared for newN2 and newS6: 100%|██████████| 14717/14717 [00:01<00:00, 9322.29it/s] 
Calculating Fisher's exact for newN2 and newS6: 100%|██████████| 14717/14717 [00:00<00:00, 20459.11it/s]


There were 254 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS6
	A Bonferroni correction with this in mind will expect a p value of 0.000197 for a significant chi2 test result.
There were 254 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS6
	A Bonferroni correction with this in mind will expect a p value of 0.000197 for a significant boschloo test result.
Couldn't find columns corresponding to 'boschloo'!! Be sure spelling is correct!
There were 254 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS6
	A Bonferroni correction with this in mind will expect a p value of 0.000197 for a significant fishers test result.



invalid value encountered in divide

Calculating Chi Squared for newN2 and newS7: 100%|██████████| 14717/14717 [00:01<00:00, 9058.86it/s] 
Calculating Fisher's exact for newN2 and newS7: 100%|██████████| 14717/14717 [00:00<00:00, 19179.58it/s]


There were 360 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS7
	A Bonferroni correction with this in mind will expect a p value of 0.000139 for a significant chi2 test result.
There were 360 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS7
	A Bonferroni correction with this in mind will expect a p value of 0.000139 for a significant boschloo test result.
Couldn't find columns corresponding to 'boschloo'!! Be sure spelling is correct!
There were 360 genes that passed the cutoff of having >=100 cumulative unadapted hits between newN2 and newS7
	A Bonferroni correction with this in mind will expect a p value of 0.000139 for a significant fishers test result.



invalid value encountered in divide

Calculating Chi Squared for oldN2 and oldS6: 100%|██████████| 14717/14717 [00:01<00:00, 8288.24it/s] 
Calculating Fisher's exact for oldN2 and oldS6: 100%|██████████| 14717/14717 [00:01<00:00, 11550.37it/s]

There were 830 genes that passed the cutoff of having >=100 cumulative unadapted hits between oldN2 and oldS6
	A Bonferroni correction with this in mind will expect a p value of 6.02e-05 for a significant chi2 test result.
There were 830 genes that passed the cutoff of having >=100 cumulative unadapted hits between oldN2 and oldS6
	A Bonferroni correction with this in mind will expect a p value of 6.02e-05 for a significant boschloo test result.
Couldn't find columns corresponding to 'boschloo'!! Be sure spelling is correct!
There were 830 genes that passed the cutoff of having >=100 cumulative unadapted hits between oldN2 and oldS6
	A Bonferroni correction with this in mind will expect a p value of 6.02e-05 for a significant fishers test result.
done.





# Look at the top "most significant" hits

In [23]:
how_many_hits_to_save = 100
for stat_test in ["chi2", "fishers"]:
    for first_lib, second_lib in list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]:
        temp_df = filter_df[[f"gene_rpm_{first_lib}_t5-",
                            f"gene_rpm_{first_lib}_t5+",
                            f"gene_rpm_{second_lib}_t5-",
                            f"gene_rpm_{second_lib}_t5+",
                            f"{first_lib}_v_{second_lib}_{stat_test}_test_result",
                            f"{first_lib}_v_{second_lib}_{stat_test}_p_value",
                            f"{first_lib}_v_{second_lib}_{stat_test}_significant",
                            ]].sort_values(f"{first_lib}_v_{second_lib}_{stat_test}_p_value",
                                           ascending=True).head(how_many_hits_to_save)
        print(temp_df)
        # temp_df.to_csv(f"./output_files/{npCommon.get_dt()}_{first_lib}_v_{second_lib}.{stat_test}.top{how_many_hits_to_save}.csv")

                                 gene_rpm_newN2_t5-  gene_rpm_newN2_t5+  gene_rpm_newS5_t5-  gene_rpm_newS5_t5+  newN2_v_newS5_chi2_test_result  newN2_v_newS5_chi2_p_value  \
chr_id gene_id        gene_name                                                                                                                                               
V      WBGene00008944 F19B2.5            270.059124          115.739624         1116.239780           37.710803                       24.980647                5.790867e-07   
X      WBGene00000755 col-182            183.254405          163.964468          248.891302            0.000000                       18.212818                1.975449e-05   
MtDNA  WBGene00010966 nduo-3            7590.590369         2980.295329         7738.256856         4185.899176                       13.828959                2.002262e-04   
II     WBGene00015894 acdh-2              86.804718           38.579875          708.963104           15.084321              

In [24]:
fig = px.scatter_matrix(filter_df.reset_index(),
                        dimensions=[col for col in filter_df if "fraction_adapted" in col],
                        labels={f"fraction_adapted_{lib}":f"Fraction Adapted <br> {lib}" for lib in conversion_dict.values()},
                        hover_name="gene_name")
fig.update_layout(template="plotly_white")
# fig.update_layout(xaxis=dict(type='log'),xaxis2=dict(type='log'),xaxis3=dict(type='log'),xaxis4=dict(type='log'))
fig.show()


iteritems is deprecated and will be removed in a future version. Use .items instead.



# Plot relationship of fraction adapted between "replicates"

In [25]:
libs_combos_to_plot = [("oldN2", "newN2")] + [("oldS6", "newS6")]  #+ list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]

drop_zeros = False
log_plot = True

# WE"LL HAVE THESE THINGS CARRY OVER FROM ABOVE CELL!!
# min_read_cutoff = 100
# filter_with_rpm_or_hits = "hits"
# filter_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

for libs_to_plot in libs_combos_to_plot:
    first_lib, second_lib = libs_to_plot
    plot_df = filter_df[[col for col in filter_df.columns if first_lib in col or second_lib in col]]
    # I have the filter step running again down here so that we can do it more stepwise!
    for lib in libs_to_plot:
        filter_col_converter = {"total": f"total_gene_{filter_with_rpm_or_hits}_{lib}",
                                "adapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5+",
                                "unadapted": f"gene_{filter_with_rpm_or_hits}_{lib}_t5-"}
        plot_df = plot_df[plot_df[filter_col_converter[filter_col_target]] >= cumulative_min_read_cutoff]
        if drop_zeros:
            # To have the OLS work with logged data we need all values to be positive:
            plot_df = plot_df.query(f"fraction_adapted_{lib} > 0")
    if drop_zeros:
        additional_plot_params = dict(trendline_options=dict(log_x=True, log_y=True))
        additional_title_details = " (All zero values dropped)"
        additional_file_details = ".droppedZeros"
    else:
        additional_plot_params = {}
        additional_title_details = ""
        additional_file_details = ""
    if log_plot:
        additional_plot_params = {**additional_plot_params, "log_x": True, "log_y": True}
    fig = px.scatter(plot_df.reset_index(),
                     x=f"fraction_adapted_{first_lib}",
                     y=f"fraction_adapted_{second_lib}",
                     # log_x=True, log_y=True,
                     hover_name="gene_name",
                     hover_data=[col for col in filter_df.columns if first_lib in col or second_lib in col],
                     trendline="ols",
                     labels={f"fraction_adapted_{lib}":f"Fraction Adapted <br> {lib}" for lib in conversion_dict.values()},
                     **additional_plot_params)
    
    print(first_lib, second_lib)
    results = px.get_trendline_results(fig).px_fit_results.iloc[0]
    print(results.summary())
    
    fig.update_layout(template="plotly_white", title=f"<b>Fraction Adapted for {first_lib} and {second_lib}</b><br>w/ {filter_with_rpm_or_hits} cutoff of {cumulative_min_read_cutoff} across plotted libs {'&nbsp;' * 5} OLS Trendline; R<sup>2</sup>: {results.rsquared:0.4f}; adjR<sup>2</sup>: {results.rsquared_adj:0.4f} {'&nbsp;' * 5} n={plot_df.shape[0]}{additional_title_details}")
    fig.write_html(f"./output_files/{npCommon.get_dt()}_fractionAdapted_{first_lib}-v-{second_lib}_min{cumulative_min_read_cutoff}{filter_col_target}{filter_with_rpm_or_hits.title()}{additional_file_details}.scatter.html")
    fig.show()

oldN2 newN2
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.732
Model:                            OLS   Adj. R-squared:                  0.730
Method:                 Least Squares   F-statistic:                     407.3
Date:                Tue, 24 Jan 2023   Prob (F-statistic):           1.81e-44
Time:                        18:16:59   Log-Likelihood:                 389.28
No. Observations:                 151   AIC:                            -774.6
Df Residuals:                     149   BIC:                            -768.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0014      0.002     -0.

# Plot to try and decide cutoff for total/adapted/unadapted hits/rpm

In [26]:
libs_combos_to_plot = [("oldN2", "newN2")] + [("oldS6", "newS6")] + [("newN2", "newS6")] #+ list(zip(["newN2"]*3, ["newS5", "newS6", "newS7"])) + [("oldN2", "oldS6")]
plot_with_rpm_or_hits = "hits"
plot_col_target = "unadapted"  # "total" or "adapted" or "unadapted"

def calc_stdDev_and_stdDevOverMean(values):
    std_dev = stats.stdev(values)
    mean = stats.mean(values)
    try:
        std_dev_over_mean = std_dev / mean
    except ZeroDivisionError:
        std_dev_over_mean = np.Inf
    return std_dev, mean, std_dev_over_mean

for libs_to_plot in libs_combos_to_plot:
    first_lib, second_lib = libs_to_plot
    plot_df = filter_df[[col for col in filter_df.columns if first_lib in col or second_lib in col]]
    plot_cols = []
    for lib in libs_to_plot:
        plot_col_converter = {"total": f"total_gene_{plot_with_rpm_or_hits}_{lib}",
                              "adapted": f"gene_{plot_with_rpm_or_hits}_{lib}_t5+",
                              "unadapted": f"gene_{plot_with_rpm_or_hits}_{lib}_t5-"}
        plot_cols.append(plot_col_converter[plot_col_target])
    metric_col_name = f"combined_{plot_col_target}_{plot_with_rpm_or_hits}_{first_lib}+{second_lib}"
    plot_df[metric_col_name] = plot_df[plot_cols[0]] + plot_df[plot_cols[1]]
    
    plot_df[f"fracAdapted_{first_lib}/{second_lib}"] = (plot_df[f"fraction_adapted_{first_lib}"] / plot_df[f"fraction_adapted_{second_lib}"])
    plot_df[f"log2_fracAdapted_{first_lib}/{second_lib}"] = np.log2(plot_df[f"fracAdapted_{first_lib}/{second_lib}"])
    
    plot_df[f"deltaFracAdapted_{first_lib}-{second_lib}"] = (plot_df[f"fraction_adapted_{first_lib}"] - plot_df[f"fraction_adapted_{second_lib}"])
    
    plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"] = np.log2((plot_df[f"fraction_adapted_{first_lib}"] / plot_df[f"fraction_adapted_{second_lib}"]))
    
    plot_df[[f"stdDev_fracAdapted_{first_lib}_{second_lib}", f"mean_fracAdapted_{first_lib}_{second_lib}", f"stdDevOverMean_fracAdapted_{first_lib}_{second_lib}"]] = plot_df.apply(lambda row: calc_stdDev_and_stdDevOverMean([row[f"fraction_adapted_{first_lib}"], row[f"fraction_adapted_{second_lib}"]]), axis=1, result_type="expand")
    
    fig = px.scatter(plot_df.reset_index(),
                     x=metric_col_name,
                     # =f"mean_fracAdapted_{first_lib}_{second_lib}",
                     # y=f"stdDev_fracAdapted_{first_lib}_{second_lib}",
                     y=f"deltaFracAdapted_{first_lib}-{second_lib}",
                     color="chr_id",
                     log_x=True,
                     hover_name="gene_name",
                     hover_data=["gene_id", "chr_id"] + [col for col in filter_df.columns if first_lib in col or second_lib in col])
    
    fig.update_layout(template="plotly_white")
    fig.write_html(f"./output_files/{npCommon.get_dt()}_deltaFracAdapted-v-combined{plot_col_target.title()}{plot_with_rpm_or_hits.title()}_{first_lib}-{second_lib}.scatter.html")
    fig.write_image(f"./output_files/{npCommon.get_dt()}_deltaFracAdapted-v-combined{plot_col_target.title()}{plot_with_rpm_or_hits.title()}_{first_lib}-{second_lib}.scatter.png")
    fig.show()
    print(plot_df.shape)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


divide by zero encountered in log2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

(14717, 58)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


divide by zero encountered in log2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

(14717, 40)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


divide by zero encountered in log2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

(14717, 49)


# Plot fraction adapted fold change and Chi2/Fishers p-values

In [29]:
from plotly.subplots import make_subplots

stat_test = "fishers"  # "fishers" or "chi2"

plot_df = filter_df.copy(deep=True)
for first_lib, second_lib in [("newN2", "newS5"),
                              ("newN2", "newS6"),
                              # ("newN2", "newS7"),
                              ("oldN2", "oldS6"),
                              ]:
    plot_df[f"fracAdapted_FC_{first_lib}/{second_lib}"] = np.log2(plot_df[f"fraction_adapted_{first_lib}"] / plot_df[f"fraction_adapted_{second_lib}"])
plot_df = plot_df.reset_index()
plot_df = plot_df[plot_df.chr_id != 'MtDNA']
plot_df = plot_df.replace({-np.inf: -6, np.inf: 6})


fig = make_subplots(rows=1, cols=3,
                    shared_yaxes=False,
                    shared_xaxes=True,
                    vertical_spacing=0.02,
                    horizontal_spacing=0.02,
                    subplot_titles=["Old N2 vs. smg-6",
                                    "New N2 vs. smg-5",
                                    "New N2 vs. smg-6",
                                    # "New N2 vs. smg-7",
                                    ])

fig.add_trace(go.Scatter(
    x=plot_df["fracAdapted_FC_oldN2/oldS6"],
    y=plot_df[f"neg_log10_oldN2_v_oldS6_{stat_test}_p_value"],
    marker_color=plot_df[f"oldN2_v_oldS6_{stat_test}_significant"].replace(
        {False: "black",
         True: "red"}
    ),
    hovertext=plot_df["gene_name"],
    mode="markers",
    name="smg-6"),
              row=1, col=1)

cutoff, num_passed_genes = p_value_cutoff_dict[(("oldN2", "oldS6"), "fishers")]
fig.add_hline(y=-np.log10(cutoff),
              row=1, col=1)
fig.add_annotation(text=f"Bonferroni adjusted<br>p-value = {cutoff:0.3e}",
                   yref="y",
                   y=-np.log10(cutoff),
                   xref="x domain",
                   x=0,
                   yanchor="bottom",
                   showarrow=False,
                   align="left")
fig.add_annotation(text=f"<b>genes w/ > 100 {filter_col_target} {filter_with_rpm_or_hits} = {num_passed_genes}",
                  yref="y domain",
                  y=0,
                  xref="x domain",
                  x=0.5,
                  yanchor="bottom",
                  showarrow=False,
                  align="center",
                  row=1, col=1)

for i, smg in enumerate([5,
                         6,
                         # 7,
                         ]):
    fig.add_trace(go.Scatter(
        x=plot_df[f"fracAdapted_FC_newN2/newS{smg}"],
        y=plot_df[f"neg_log10_newN2_v_newS{smg}_{stat_test}_p_value"],
        marker_color=plot_df[f"newN2_v_newS{smg}_{stat_test}_significant"].replace({False: "black", True: "red"}),
        hovertext=plot_df["gene_name"],
        mode="markers",
        name=f"smg-{smg}"),
                  row=1, col=i+2)
    
    cutoff, num_passed_genes = p_value_cutoff_dict[(("newN2", f"newS{smg}"), "fishers")]
    fig.add_hline(y=-np.log10(cutoff),
              row=1, col=i+2)
    fig.add_annotation(text=f"Bonferroni<br>p-value = {cutoff:0.3e}",
                       yref="y",
                       y=-np.log10(cutoff),
                       xref="x domain",
                       x=0,
                       yanchor="bottom",
                       showarrow=False,
                       align="left",
                       row=1, col=i+2)
    fig.add_annotation(text=f"<b>genes w/ > 100 {filter_col_target} {filter_with_rpm_or_hits} = {num_passed_genes}",
                   yref="y domain",
                   y=0,
                   xref="x domain",
                   x=0.5,
                   yanchor="bottom",
                   showarrow=False,
                   align="center",
                   row=1, col=i+2)



fig.update_layout(template="plotly_white",
                  title=f"<b>FC Frac. Adapted vs. {stat_test.title()} Test P-Values</b>"
                        f"<br>Cumulative {filter_col_target} {filter_with_rpm_or_hits} for Cutoff = {cumulative_min_read_cutoff}")

fig.update_xaxes(tickvals=["-6", "-4", "-2", "0", "2", "4", "6"],
                 ticktext=["-INF", "-4", "-2", "0", "2", "4", "INF"])

fig.update_yaxes(title_text=f"-log<sub>10</sub> {stat_test.title()} Test P-Value",
                 row=1, col=1)
fig.update_xaxes(title_text="Fold Change Fraction Adapted<br>N2 vs. smg")
fig.write_html(f"./output_files/{npCommon.get_dt()}_{stat_test}Pvalues-v-FCfractionAdapted.scatters.html")
fig.show()
plot_df.query("gene_name == 'F19B2.5'").T


divide by zero encountered in log2


divide by zero encountered in log2


divide by zero encountered in log2



Unnamed: 0,1083
chr_id,V
gene_id,WBGene00008944
gene_name,F19B2.5
gene_hits_oldN2_t5+,82.0
gene_rpm_oldN2_t5+,172.959656
...,...
neg_log10_oldN2_v_oldS6_fishers_p_value,24.324599
oldN2_v_oldS6_fishers_significant,True
fracAdapted_FC_newN2/newS5,3.198494
fracAdapted_FC_newN2/newS6,3.584963


# Assessing p-value rank "shape"

In [13]:
fig = make_subplots(rows=4, cols=2,
                    column_widths=[0.8, 0.2],
                    shared_yaxes=False,
                    shared_xaxes=True,
                    vertical_spacing=0.1,
                    horizontal_spacing=0.02,
                    subplot_titles=["Old N2 vs. smg-6", "(zoom)",
                                    "New N2 vs. smg-5", "(zoom)",
                                    "New N2 vs. smg-6", "(zoom)",
                                    "New N2 vs. smg-7", "(zoom)",])

for i, (lib1, lib2) in enumerate([("oldN2", "oldS6"),
                                  ("newN2", "newS5"),
                                  ("newN2", "newS6"),
                                  ("newN2", "newS7")]):
    plt_df = filter_df.sort_values(f"{lib1}_v_{lib2}_fishers_p_value").reset_index()
    p_values = plt_df[f'{lib1}_v_{lib2}_fishers_p_value']
    p_value_ranks = p_values.rank()
    non_1_max_p_value_rank = list(set(plt_df[f'{lib1}_v_{lib2}_fishers_p_value'].rank()))[-2]
    # ^ This value is similar to the number of p-values that I am "really testing"
    
    # Overall plots:
    fig.add_trace(go.Scatter(x=p_value_ranks,
                             y=p_values,
                             hovertext=plt_df['gene_name'],
                             mode="markers",
                             marker=dict(color="black"),
                             name=lib2),
                  row=i+1, col=1)
    fig.add_trace(go.Scatter(y=[(0.05 / non_1_max_p_value_rank), 0.05],
                             x=[1, non_1_max_p_value_rank],
                             mode="lines",
                             marker=dict(color="red"),),
              row=i+1, col=1)
    
    # Zoom plots:
    fig.add_trace(go.Scatter(x=p_value_ranks,
                             y=p_values,
                             hovertext=plt_df['gene_name'],
                             mode="markers",
                             marker=dict(color="black"),
                             name=lib2),
                  row=i+1, col=2)
    fig.add_trace(go.Scatter(y=[(0.05 / non_1_max_p_value_rank), 0.05],
                             x=[1, non_1_max_p_value_rank],
                             mode="lines",
                             marker=dict(color="red"),),
              row=i+1, col=2)
    
    # Name axes:
    fig.update_yaxes(title_text=f"Fishers P-Value",
                     row=i+1, col=1)
    
    # Set zoom plot axis limits:
    fig.update_layout(**{f"yaxis{(i+1)*2}": dict(range=[0, 0.01]),
                         f"xaxis{(i+1)*2}": dict(range=[0, 25])})
fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
                 row=4, col=1)
fig.update_xaxes(title_text="Fishers P-Value Rank<br>N2 vs. smg",
                 row=4, col=2)
fig.update_layout(title=f"Fishers P-value vs Ranking of Fishers P-value",
                  template="plotly_white",
                  showlegend=False)
fig.show()

In [14]:
rank_df = filter_df.copy(deep=True).reset_index()
rank_df = rank_df.query("chr_id != 'MtDNA'")

# This was over-engineered solution, all I really want to check is smg-6 replicable-ity and new smg-5 v new smg-6
lib_combos = [("oldN2", "oldS6"), ("newN2", "newS5"), ("newN2", "newS6"), ("newN2", "newS7")]
for i, libs1 in enumerate(lib_combos):
    libs2 = lib_combos[(i+1) % len(lib_combos)]
    print(libs1, libs2)
    
    # but I can still use this loop for this step:
    lib1, lib2 = libs1
    rank_df[f"{lib1}_v_{lib2}_fishers_p_value_ranked"] = rank_df[f"{lib1}_v_{lib2}_fishers_p_value"].rank()

lib1, lib2, lib3, lib4 = 'newN2', 'newS6', 'oldN2', 'oldS6'

plot_rank_df = rank_df.query(f"cummulative_unadapted_hits_{lib1}_{lib2} > 100").query(f"cummulative_unadapted_hits_{lib3}_{lib4} > 100")
plot_rank_df = plot_rank_df.query(f'{lib1}_v_{lib2}_fishers_p_value < 1').query(f'{lib3}_v_{lib4}_fishers_p_value < 1')

fig = px.scatter(plot_rank_df,
                 x=f'{lib1}_v_{lib2}_fishers_p_value_ranked',
                 y=f'{lib3}_v_{lib4}_fishers_p_value_ranked',
                 hover_name='gene_name',
                 # hover_data=[''],
                 # log_x=True, log_y=True,
                 trendline='ols',
                 )
fig.show()

('oldN2', 'oldS6') ('newN2', 'newS5')
('newN2', 'newS5') ('newN2', 'newS6')
('newN2', 'newS6') ('newN2', 'newS7')
('newN2', 'newS7') ('oldN2', 'oldS6')


UndefinedVariableError: name 'cummulative_unadapted_hits_newN2_newS6' is not defined