In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path
import pandas as pd
import numpy as np
import math

import seaborn as sns

from downstream.signals.signal_r2_permutation_test import collect_paths
from downstream.signals.signal_pca_fit_error_pvalue_permutation_test import process

In [None]:
# Number of worker threads; forwarded to process() through the helper functions below.
threads = 8
# Number of permutation-test simulations; it is also embedded in the name of the
# results file, so every value gets its own results file.
simulations = 5000  # alternative setting: 100000

In [None]:
def pvalue_for(normalization, signals_root, simulations, threads):
    """Return the path of the p-value results file, computing it when missing.

    The results file lives directly under ``signals_root`` and is named
    ``validate.<normalization>.pvalue.<simulations>.csv``. When that file is
    already present nothing is recalculated.

    :param normalization: normalization name; only signal files whose name
        contains ``_<normalization>.tsv`` are processed
    :param signals_root: ``pathlib.Path`` of the directory scanned by
        ``collect_paths`` and used to hold the results file
    :param simulations: number of simulations forwarded to ``process``
    :param threads: number of threads forwarded to ``process``
    :return: path of the results file
    """
    results_name = "validate.{}.pvalue.{}.csv".format(normalization, simulations)
    output_path = signals_root / results_name
    cached = output_path.exists()
    status = "[exists]" if cached else "[not exists]"
    print("Results file:", str(output_path), status)

    # Results are already on disk: reuse them.
    if cached:
        return output_path

    print("  calculate:", str(output_path.name))

    # Collect files:
    all_paths = collect_paths(signals_root)
    name_marker = "_{}.tsv".format(normalization)
    #datatypes = ["H3K4me1", "H3K4me3", "H3K27ac", "H3K36me3", "H3K27me3", "meth"]
    paths = list(filter(lambda candidate: name_marker in candidate.name, all_paths))
    print("Selected Paths: ", len(paths), "of", len(all_paths))

    # Calc pvalues (presumably process() writes them to output_path — TODO confirm):
    process(paths, str(output_path), seed=100, simulations=simulations, threads=threads, fdr=True)
    return output_path

In [None]:
def fdr_control(normalization, signals_root, fdr=0.05, simulations=10000, threads=4):
    """Load (or compute) the p-value table and select loci passing FDR control.

    The table is obtained through ``pvalue_for`` and must provide the
    ``file``, ``pvalue`` and ``pvalue_corr`` columns. Three columns are added:

    * ``loci`` - file name part of ``file``;
    * ``rnd_better_errors`` - ``round(simulations * pvalue) - 1``
      (presumably the number of random simulations with a better error -
      TODO confirm against ``process``);
    * ``expected_errors`` - ``len(table) * pvalue``.

    A short report is printed: the first row (in table order) whose
    ``pvalue_corr`` is not below ``fdr``, the total number of loci, the number
    of loci passing the threshold and ``passed * fdr`` as expected false
    positives.

    :param normalization: normalization name forwarded to ``pvalue_for``
    :param signals_root: ``pathlib.Path`` forwarded to ``pvalue_for``
    :param fdr: threshold applied to ``pvalue_corr`` (strictly below passes)
    :param simulations: number of simulations forwarded to ``pvalue_for``
    :param threads: number of threads forwarded to ``pvalue_for``
    :return: tuple ``(fdf, df, df_path)`` - rows passing the threshold without
        the ``file`` column, the full table, and the path of the results file
    """
    df_path = pvalue_for(normalization, signals_root, simulations, threads)
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with default
    # arguments keeps the same plain RangeIndex as from_csv(index_col=None).
    df = pd.read_csv(df_path)
    df["loci"] = [Path(f).name for f in df.file]
    df["rnd_better_errors"] = np.round(simulations * df.pvalue) - 1
    df["expected_errors"] = len(df) * df.pvalue

    # Rows that did not pass the threshold, in table order.
    failed = df[df.pvalue_corr >= fdr]
    print("First pvalue failed FDR control:")
    if failed.empty:
        # Previously this case raised StopIteration from next(...).
        print("<none: every pvalue_corr is below {}>".format(fdr))
    else:
        print(*zip(df.columns, failed.iloc[0, :]), sep="\n")

    fdf = df[df.pvalue_corr < fdr].drop("file", axis=1)
    print()
    print("Loci number:", len(df))
    print("Passes FDR control:", len(fdf))
    print("Expected FP", len(fdf) * fdr)

    return fdf, df, df_path

# Histone Modifications

In [None]:
# Root directory of the signal files used by the fdr_control(...) cells of this section.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
signals_root = Path("/mnt/stripe/bio/experiments/signal_experiment")

In [None]:
# Loci passing FDR control for the "rawq" normalization ([0] selects the filtered frame).
fdr_control("rawq", signals_root, simulations = simulations, threads = threads)[0]

In [None]:
# Loci passing FDR control for the "fripz" normalization ([0] selects the filtered frame).
fdr_control("fripz", signals_root, simulations = simulations, threads = threads)[0]

In [None]:
# Loci passing FDR control for the "fripm" normalization ([0] selects the filtered frame).
fdr_control("fripm", signals_root, simulations = simulations, threads = threads)[0]

In [None]:
# Loci passing FDR control for the "manorm" normalization ([0] selects the filtered frame).
fdr_control("manorm", signals_root, simulations = simulations, threads = threads)[0]

In [None]:
# Loci passing FDR control for the "diffbind_tmm_minus_full" normalization
# ([0] selects the filtered frame).
fdr_control("diffbind_tmm_minus_full", signals_root, simulations = simulations, threads = threads)[0]

In [None]:
# Loci passing FDR control for the "diffbind_tmm_reads_effective_cpm" normalization
# ([0] selects the filtered frame).
fdr_control("diffbind_tmm_reads_effective_cpm", signals_root, simulations = simulations, threads = threads)[0]

# Input

In [None]:
# Root directory of the signal files for the "Input" section.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
input_signals_root = Path("/mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws")

In [None]:
# FDR control for the "rawq"-normalized input signals; the cell displays the
# first rows of the full table without the long "file" column.
norm = "rawq"
# The third element was previously bound to the misspelled name `df_patj`.
fdf, df, df_path = fdr_control(norm, input_signals_root, simulations = simulations, threads = threads)
df.drop("file", axis=1).head()

In [None]:
# FDR control for the "rawz"-normalized input signals; the cell displays the
# first rows of the full table without the long "file" column.
norm = "rawz"
fdf, df, df_path = fdr_control(norm, input_signals_root, simulations = simulations, threads = threads)
# fdr_control() already adds the "loci" column, so it is not recomputed here.
df.drop("file", axis=1).head()

# H3K27me3 + DiffBind scores options

In [None]:
# Root directory for this section.
# NOTE(review): this rebinds `signals_root` from the "Histone Modifications" section,
# so re-running earlier cells after this one would use this directory instead.
signals_root = Path("/mnt/stripe/bio/experiments/k27me3@dmrs")

In [None]:
# Count tables directly under signals_root whose names match "k*_counts.csv";
# the cell displays their file names.
csv_paths = list(signals_root.glob("k*_counts.csv"))
[csv_path.name for csv_path in csv_paths]

In [None]:
def pvalue_for_csv_paths(paths, signals_root, simulations, threads):
    """Return the path of the p-value results file for explicitly given inputs.

    The results file lives directly under ``signals_root`` and is named
    ``validate.pvalue.<simulations>.csv``. When that file is already present
    nothing is recalculated.

    :param paths: input files forwarded unchanged to ``process``
    :param signals_root: ``pathlib.Path`` of the directory holding the results file
    :param simulations: number of simulations forwarded to ``process``
    :param threads: number of threads forwarded to ``process``
    :return: path of the results file
    """
    results_name = "validate.pvalue.{}.csv".format(simulations)
    output_path = signals_root / results_name
    cached = output_path.exists()
    status = "[exists]" if cached else "[not exists]"
    print("Results file:", str(output_path), status)

    # Results are already on disk: reuse them.
    if cached:
        return output_path

    print("  calculate:", str(output_path.name))

    # Calc pvalues (presumably process() writes them to output_path — TODO confirm):
    process(paths, str(output_path), seed=100, simulations=simulations, threads=threads, fdr=True)
    return output_path

In [None]:
# Compute p-values for the count tables, or reuse the results file if it already exists.
df_path = pvalue_for_csv_paths(csv_paths, signals_root, simulations, threads)

In [None]:
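# Uncomment to delete a saved results file so that it is recalculated on the next run
# (the file name below corresponds to simulations = 100001):
#! rm /mnt/stripe/bio/experiments/k27me3@dmrs/validate.pvalue.100001.csv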

In [None]:
# DataFrame.from_csv was removed in pandas 1.0; read_csv with default arguments
# keeps the same plain RangeIndex as from_csv(index_col=None).
df = pd.read_csv(df_path).drop("file", axis=1)
df["modification"] = "H3K27me3"
# Strip the "k27me3@dmrs_" prefix so only the remainder of each label is shown.
df["normalization"] = [label.replace("k27me3@dmrs_", "") for label in df["normalization"]]
df

In [None]:
# Print one "[error] normalization: pvalue" line per row of the results table.
for _, row in df.iterrows():
    line = "[{}] {}: {}".format(row["error"], row["normalization"], row["pvalue"])
    print(line)