In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path
import pandas as pd
import numpy as np
import math

import seaborn as sns

from downstream.signals.signal_r2_permutation_test import collect_paths
from downstream.signals.signal_pca_fit_error_pvalue_permutation_test import process

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [6]:
# Shared run configuration, passed to every p-value computation below.
# `simulations` is also embedded in the cached results file name.
simulations = 100001
# Forwarded as `threads=` to the permutation-test `process` call.
threads = 8

In [3]:
def pvalue_for(normalization, signals_root, simulations, threads):
    """Return the path of the p-value CSV for one normalization, computing it if absent.

    The results file is `signals_root / "validate.<normalization>.pvalue.<simulations>.csv"`.
    When that file already exists nothing is recomputed. Otherwise every path
    returned by `collect_paths(signals_root)` whose file name contains
    `"_<normalization>.tsv"` is handed to `process` together with the output
    path (presumably `process` writes the CSV there; its implementation is
    not visible from this notebook).

    Parameters:
        normalization: normalization tag used in the file-name filter and in
            the results file name.
        signals_root: `pathlib.Path` of the directory holding the signal files.
        simulations: forwarded to `process` and embedded in the results file name.
        threads: forwarded to `process`.

    Returns:
        `pathlib.Path` of the results file.
    """
    result_name = "validate.{}.pvalue.{}.csv".format(normalization, simulations)
    output_path = signals_root / result_name

    cached = output_path.exists()
    print("Results file:", str(output_path), "[exists]" if cached else "[not exists]")
    if cached:
        return output_path

    print("  calculate:", str(output_path.name))

    # Keep only the signal tables produced with the requested normalization.
    name_marker = "_{}.tsv".format(normalization)
    candidates = collect_paths(signals_root)
    selected = [candidate for candidate in candidates if name_marker in candidate.name]
    print("Selected Paths: ", len(selected), "of", len(candidates))

    # Fixed seed so repeated runs use the same random stream.
    process(selected, str(output_path), seed=100, simulations=simulations, threads=threads, fdr=True)
    return output_path

In [4]:
def fdr_control(normalization, signals_root, fdr=0.05,  simulations = 10000, threads = 4):
    """Load (or compute) the p-value table for a normalization and apply an FDR cut-off.

    The table is obtained through `pvalue_for` and enriched with three columns:

    * `loci` - the last path component of each entry in the `file` column;
    * `rnd_better_errors` - `round(simulations * pvalue) - 1`
      (presumably the number of random permutations that did at least as well
      as the real labelling; confirm against `process`);
    * `expected_errors` - `len(df) * pvalue`.

    A short report is printed: the first row whose `pvalue_corr` is not below
    `fdr` (or a note that there is none), the total number of loci, how many
    pass, and `len(passed) * fdr` as the expected number of false positives.

    Parameters:
        normalization: normalization tag forwarded to `pvalue_for`.
        signals_root: `pathlib.Path` of the signals directory, forwarded to `pvalue_for`.
        fdr: threshold; rows with `pvalue_corr < fdr` pass.
        simulations: forwarded to `pvalue_for` and used for `rnd_better_errors`.
        threads: forwarded to `pvalue_for`.

    Returns:
        Tuple `(fdf, df, df_path)`: the passing rows without the `file` column,
        the full enriched table, and the path of the CSV that was read.
    """
    df_path = pvalue_for(normalization, signals_root, simulations, threads)
    # `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0.
    # `read_csv` with its default `index_col=None` keeps a plain integer index,
    # exactly like the former `from_csv(..., index_col=None)` call.
    df = pd.read_csv(df_path)
    df["loci"] = [Path(f).name for f in df["file"]]
    df["rnd_better_errors"] = np.round(simulations * df["pvalue"]) - 1
    df["expected_errors"] = len(df) * df["pvalue"]

    print("First pvalue failed FDR control:")
    failed = df[df["pvalue_corr"] >= fdr]
    if failed.empty:
        # Previously `next(...)` raised StopIteration when every locus passed.
        print("  <none: every locus passes FDR control>")
    else:
        # One (column, value) pair per line for the first failing row.
        print(*zip(df.columns, failed.iloc[0, :]), sep="\n")

    fdf = df[df["pvalue_corr"] < fdr].drop("file", axis=1)
    print()
    print("Loci number:", len(df))
    print("Passes FDR control:", len(fdf))
    print("Expected FP", len(fdf) * fdr)

    return fdf, df, df_path

# Histone Modifications

In [16]:
# Root directory of the histone-modification signal experiments; the cached
# validate.<normalization>.pvalue.<simulations>.csv files are written next to the data.
signals_root = Path("/mnt/stripe/bio/experiments/signal_experiments")

In [63]:
# Show only the loci that pass FDR control for the "rawq" normalization
# (element 0 of the returned tuple).
fdr_control(
    normalization="rawq",
    signals_root=signals_root,
    simulations=simulations,
    threads=threads,
)[0]

Results file: /mnt/stripe/bio/experiments/signal_experiments/validate.rawq.pvalue.100001.csv [exists]
First pvalue failed FDR control:
('modification', 'meth')
('file', '/mnt/stripe/bio/experiments/signal_experiments/meth/cuffdiff_g_transcript')
('normalization', 'rawq')
('error', 7)
('pvalue', 0.00090998180036399277)
('pvalue_corr', 0.12336753264934702)
('loci', 'cuffdiff_g_transcript')
('rnd_better_errors', 90.0)
('expected_errors', 1.7271454570908582)

Loci number: 1898
Passes FDR control: 12
Expected FP 0.6000000000000001


Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr,loci,rnd_better_errors,expected_errors
0,H3K27ac,rawq,5,1e-05,0.001582,cpg_minavcov10_complex_4outliers.narrow.adjust...,0.0,0.01898
1,H3K27ac,rawq,6,1e-05,0.001582,diffReps_H3K27ac_both,0.0,0.01898
2,H3K27ac,rawq,7,1e-05,0.001582,diffReps_broad_H3K27ac_both,0.0,0.01898
3,H3K27ac,rawq,18,1e-05,0.001582,conservation_regions_0.7_0.8,0.0,0.01898
4,H3K27me3,rawq,1,1e-05,0.001582,diffReps_H3K27me3_both,0.0,0.01898
5,H3K27me3,rawq,0,1e-05,0.001582,diffReps_broad_input_H3K27me3_both,0.0,0.01898
6,H3K27me3,rawq,0,1e-05,0.001582,diff_OD_YD_H3K27me3_zinbra_200_0.01_10_both,0.0,0.01898
7,H3K4me1,rawq,0,1e-05,0.001582,diff_OD_YD_H3K4me1_zinbra_200_1.0E-4_10_both,0.0,0.01898
8,H3K4me3,rawq,0,1e-05,0.001582,diff_OD_YD_H3K4me3_zinbra_200_0.05_5_both,0.0,0.01898
9,H3K4me3,rawq,14,1e-05,0.001582,conservation_regions_0.6_0.7,0.0,0.01898


In [57]:
# Show only the loci that pass FDR control for the "fripz" normalization
# (element 0 of the returned tuple).
fdr_control(
    normalization="fripz",
    signals_root=signals_root,
    simulations=simulations,
    threads=threads,
)[0]

Results file: /mnt/stripe/bio/experiments/signal_experiments/validate.fripz.pvalue.100001.csv [exists]
First pvalue failed FDR control:
('modification', 'H3K27ac')
('file', '/mnt/stripe/bio/experiments/signal_experiments/H3K27ac/diffReps_H3K27ac_both')
('normalization', 'fripz')
('error', 7)
('pvalue', 0.00030999380012399755)
('pvalue_corr', 0.051148977020459598)
('loci', 'diffReps_H3K27ac_both')
('rnd_better_errors', 30.0)
('expected_errors', 0.46034079318413634)

Loci number: 1485
Passes FDR control: 8
Expected FP 0.4


Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr,loci,rnd_better_errors,expected_errors
0,H3K27ac,fripz,4,1e-05,0.001856,cpg_minavcov10_complex_4outliers.narrow.adjust...,0.0,0.01485
1,H3K27me3,fripz,12,1e-05,0.001856,diffReps_broad_input_H3K27ac_old,0.0,0.01485
2,H3K27me3,fripz,3,1e-05,0.001856,diffReps_broad_H3K27me3_both,0.0,0.01485
3,H3K27me3,fripz,3,1e-05,0.001856,diffReps_H3K27me3_both,0.0,0.01485
4,H3K27me3,fripz,0,1e-05,0.001856,diffReps_broad_input_H3K27me3_both,0.0,0.01485
5,H3K27me3,fripz,0,1e-05,0.001856,diff_OD_YD_H3K27me3_zinbra_200_0.01_10_both,0.0,0.01485
6,H3K4me1,fripz,0,1e-05,0.001856,diff_OD_YD_H3K4me1_zinbra_200_1.0E-4_10_both,0.0,0.01485
7,H3K4me3,fripz,0,1e-05,0.001856,diff_OD_YD_H3K4me3_zinbra_200_0.05_5_both,0.0,0.01485


In [58]:
# Show only the loci that pass FDR control for the "fripm" normalization
# (element 0 of the returned tuple).
fdr_control(
    normalization="fripm",
    signals_root=signals_root,
    simulations=simulations,
    threads=threads,
)[0]

Results file: /mnt/stripe/bio/experiments/signal_experiments/validate.fripm.pvalue.100001.csv [exists]
First pvalue failed FDR control:
('modification', 'H3K27me3')
('file', '/mnt/stripe/bio/experiments/signal_experiments/H3K27me3/diffReps_H3K27me3_both')
('normalization', 'fripm')
('error', 5)
('pvalue', 0.00090998180036399277)
('pvalue_corr', 0.058753172762631714)
('loci', 'diffReps_H3K27me3_both')
('rnd_better_errors', 90.0)
('expected_errors', 1.3513229735405292)

Loci number: 1485
Passes FDR control: 22
Expected FP 1.1


Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr,loci,rnd_better_errors,expected_errors
0,H3K27ac,fripm,4,1e-05,0.000782,cpg_minavcov10_complex_4outliers.narrow.adjust...,0.0,0.01485
1,H3K27ac,fripm,7,1e-05,0.000782,diff_OD_YD_H3K27ac_zinbra_200_1.0E-4_10_young,0.0,0.01485
2,H3K27ac,fripm,6,1e-05,0.000782,diff_OD_YD_H3K27ac_zinbra_200_1.0E-4_10_both,0.0,0.01485
3,H3K27ac,fripm,6,1e-05,0.000782,diffReps_H3K27ac_old,0.0,0.01485
4,H3K27ac,fripm,6,1e-05,0.000782,diffReps_H3K27ac_both,0.0,0.01485
5,H3K27ac,fripm,5,1e-05,0.000782,diffReps_broad_H3K27ac_both,0.0,0.01485
6,H3K27me3,fripm,2,1e-05,0.000782,diffReps_broad_H3K27me3_young,0.0,0.01485
7,H3K27me3,fripm,0,1e-05,0.000782,diff_OD_YD_H3K27me3_zinbra_200_0.01_10_young,0.0,0.01485
8,H3K27me3,fripm,0,1e-05,0.000782,diff_OD_YD_H3K27me3_zinbra_200_0.01_10_old,0.0,0.01485
9,H3K27me3,fripm,0,1e-05,0.000782,diffReps_broad_input_H3K27me3_both,0.0,0.01485


# Input

In [59]:
# Root directory of the input (control) signal tables; kept under a separate
# name so it does not clobber `signals_root` from the histone section.
input_signals_root = Path("/mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws")

In [60]:
# FDR check on the input signal with the "rawq" normalization.
norm = "rawq"
# Fixed the `df_patj` typo so `df_path` is actually bound by this cell,
# matching the "rawz" cell that follows.
fdf, df, df_path = fdr_control(norm, input_signals_root, simulations=simulations, threads=threads)
# Preview the best-ranked rows without the long absolute `file` column.
df.drop("file", axis=1).head()

Results file: /mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws/validate.rawq.pvalue.100001.csv [exists]
First pvalue failed FDR control:
('modification', nan)
('file', '/mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws/cgi_cd14_chromhmm18.hg19.14_TssBiv')
('normalization', 'rawq')
('error', 0)
('pvalue', 0.0030099398012039761)
('pvalue_corr', 0.57673961905377269)
('loci', 'cgi_cd14_chromhmm18.hg19.14_TssBiv')
('rnd_better_errors', 300.0)
('expected_errors', 0.89395212095758092)

Loci number: 297
Passes FDR control: 0
Expected FP 0.0


Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr,loci,rnd_better_errors,expected_errors
0,,rawq,0,0.00301,0.57674,cgi_cd14_chromhmm18.hg19.14_TssBiv,300.0,0.893952
1,,rawq,1,0.02218,0.57674,cpg_clock_1000_hg19,2217.0,6.587328
2,,rawq,1,0.0237,0.57674,cgi_cd14_chromhmm18.hg19.15_EnhBiv,2369.0,7.038759
3,,rawq,1,0.029099,0.57674,ucsc_cpgIslandExt.hg19,2909.0,8.642527
4,,rawq,1,0.031199,0.57674,cgi_cd14_chromhmm18.hg19.16_ReprPC,3119.0,9.266215


In [61]:
# FDR check on the input signal with the "rawz" normalization.
norm = "rawz"
fdf, df, df_path = fdr_control(norm, input_signals_root, simulations=simulations, threads=threads)
# `fdr_control` already adds the `loci` column to the returned frame, so it is
# not recomputed (and the frame is not mutated) here.
df.drop("file", axis=1).head()

Results file: /mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws/validate.rawz.pvalue.100001.csv [exists]
First pvalue failed FDR control:
('modification', nan)
('file', '/mnt/stripe/bio/experiments/signal_input/input_unique_tags_bws/ebseq_g_tss[-2000..2000]')
('normalization', 'rawz')
('error', 0)
('pvalue', 0.003309933801323973)
('pvalue_corr', 0.47618047639047212)
('loci', 'ebseq_g_tss[-2000..2000]')
('rnd_better_errors', 330.0)
('expected_errors', 0.98305033899322003)

Loci number: 297
Passes FDR control: 0
Expected FP 0.0


Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr,loci,rnd_better_errors,expected_errors
0,,rawz,0,0.00331,0.47618,ebseq_g_tss[-2000..2000],330.0,0.98305
1,,rawz,0,0.00331,0.47618,diffReps_H3K4me3_young,330.0,0.98305
2,,rawz,0,0.00481,0.47618,cgi_cd14_chromhmm18.hg19.10_EnhA2,480.0,1.428541
3,,rawz,0,0.00901,0.531041,cd14_chromhmm18.hg19.12_ZNF_Rpts,900.0,2.675916
4,,rawz,0,0.00901,0.531041,cd14v2_chromhmm18.hg19.12_ZNF_Rpts,900.0,2.675916


# H3K27me3 + DiffBind scores options

In [2]:
# NOTE(review): this rebinds `signals_root`, which the "Histone Modifications"
# section also uses; re-running those earlier cells after this one would
# silently read from the k27me3@dmrs directory instead.
signals_root = Path("/mnt/stripe/bio/experiments/k27me3@dmrs")

In [3]:
# Every DiffBind count table in the experiment directory, in the order
# returned by `Path.glob`.
csv_paths = list(signals_root.glob("k*_counts.csv"))
# Display just the file names.
[csv_path.name for csv_path in csv_paths]

['k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_READS_FOLD_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_RPKM_FOLD_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_READS_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_READS_MINUS_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_RPKM_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_MINUS_FULL_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_MINUS_EFFECTIVE_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_CPM_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE_CPM_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_MINUS_FULL_CPM_counts.csv',
 'k27me3@dmrs_dedup_TRUE_f_125_DBA_SCORE_TMM_MINUS_EFFECTIVE_CPM_counts.csv',
 'k27me3@dmrs_dedup_FALSE_f_125_DBA_SCORE_READS_counts.csv',
 'k27me3@dmrs_dedup_FALSE_f_125_DBA_S

In [4]:
def pvalue_for_csv_paths(paths, signals_root, simulations, threads):
    """Return the path of the p-value CSV for an explicit list of tables, computing it if absent.

    The results file is `signals_root / "validate.pvalue.<simulations>.csv"`.
    If it already exists it is reused and `paths` is ignored; otherwise `paths`
    is handed to `process` together with the output path (presumably `process`
    writes the CSV there; its implementation is not visible from this notebook).

    Parameters:
        paths: the input tables passed to `process`.
        signals_root: `pathlib.Path` of the directory where the results file lives.
        simulations: forwarded to `process` and embedded in the results file name.
        threads: forwarded to `process`.

    Returns:
        `pathlib.Path` of the results file.
    """
    result_name = "validate.pvalue.{}.csv".format(simulations)
    output_path = signals_root / result_name

    cached = output_path.exists()
    print("Results file:", str(output_path), "[exists]" if cached else "[not exists]")
    if cached:
        return output_path

    print("  calculate:", str(output_path.name))
    # Fixed seed so repeated runs use the same random stream.
    process(paths, str(output_path), seed=100, simulations=simulations, threads=threads, fdr=True)
    return output_path

In [10]:
# Reuse the cached p-value table for the DiffBind count files, or compute it.
df_path = pvalue_for_csv_paths(
    paths=csv_paths,
    signals_root=signals_root,
    simulations=simulations,
    threads=threads,
)

Results file: /mnt/stripe/bio/experiments/k27me3@dmrs/validate.pvalue.100001.csv [exists]


In [8]:
# Uncomment to delete the cached results file and force the p-values to be recomputed:
#! rm /mnt/stripe/bio/experiments/k27me3@dmrs/validate.pvalue.100001.csv

In [18]:
# `pd.DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0;
# `read_csv` with its default `index_col=None` loads the same table.
df = pd.read_csv(df_path).drop("file", axis=1)
# Every table in this section is an H3K27me3 experiment.
df.loc[:, "modification"] = "H3K27me3"
# Strip the common experiment prefix so only the DiffBind options remain.
df["normalization"] = [label.replace("k27me3@dmrs_", "") for label in df["normalization"]]
df

Unnamed: 0,modification,normalization,error,pvalue,pvalue_corr
0,H3K27me3,dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_coun...,4,1e-05,0.00013
1,H3K27me3,dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE...,3,1e-05,0.00013
2,H3K27me3,dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_CPM_...,3,1e-05,0.00013
3,H3K27me3,dedup_FALSE_f_150_DBA_SCORE_READS_counts.csv,3,1e-05,0.00013
4,H3K27me3,dedup_TRUE_f_125_DBA_SCORE_READS_counts.csv,2,9e-05,0.000669
5,H3K27me3,dedup_FALSE_f_125_DBA_SCORE_TMM_READS_EFFECTIV...,4,9e-05,0.000669
6,H3K27me3,dedup_TRUE_f_150_DBA_SCORE_READS_counts.csv,3,9e-05,0.000669
7,H3K27me3,dedup_TRUE_f_150_DBA_SCORE_TMM_READS_FULL_coun...,4,0.00017,0.001105
8,H3K27me3,dedup_TRUE_f_150_DBA_SCORE_TMM_READS_FULL_CPM_...,5,0.00025,0.001444
9,H3K27me3,dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE...,5,0.00033,0.001716


In [29]:
# Print every row in full, one line each: "[error] normalization: pvalue".
for error, normalization, pvalue in zip(df["error"], df["normalization"], df["pvalue"]):
    print("[{}] {}: {}".format(error, normalization, pvalue))

[4] dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_counts.csv: 9.99980000399992e-06
[3] dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE_counts.csv: 9.99980000399992e-06
[3] dedup_TRUE_f_125_DBA_SCORE_TMM_READS_FULL_CPM_counts.csv: 9.99980000399992e-06
[3] dedup_FALSE_f_150_DBA_SCORE_READS_counts.csv: 9.99980000399992e-06
[2] dedup_TRUE_f_125_DBA_SCORE_READS_counts.csv: 8.999820003599928e-05
[4] dedup_FALSE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE_counts.csv: 8.999820003599928e-05
[3] dedup_TRUE_f_150_DBA_SCORE_READS_counts.csv: 8.999820003599928e-05
[4] dedup_TRUE_f_150_DBA_SCORE_TMM_READS_FULL_counts.csv: 0.00016999660006799863
[5] dedup_TRUE_f_150_DBA_SCORE_TMM_READS_FULL_CPM_counts.csv: 0.000249995000099998
[5] dedup_TRUE_f_125_DBA_SCORE_TMM_READS_EFFECTIVE_CPM_counts.csv: 0.00032999340013199736
[5] dedup_FALSE_f_125_DBA_SCORE_READS_counts.csv: 0.0004899902001959961
[5] dedup_TRUE_f_150_DBA_SCORE_TMM_MINUS_FULL_CPM_counts.csv: 0.0006499870002599947
[5] dedup_TRUE_f_125_DBA_SCORE_TMM_MINUS_FULL