In [None]:
from pathlib import Path
from itertools import chain
import subprocess 
import os
from collections import defaultdict
import itertools

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
from IPython.display import display
import pybedtools
import seaborn as sns

In [None]:
# results_dir = Path("/mnt/stripe/bio/experiments/aging/loci_of_interest.tables")
# sorted_root = Path("/mnt/stripe/bio/experiments/aging/loci.sorted")
# THREADS_N = 32

# Directory where cached intersection / Jaccard tables and signal summaries are written.
results_dir = Path("/Volumes/BigData/bio/experiments/aging/loci_of_interest.tables")
# Root directory for cached, coordinate-sorted copies of the input BED files.
sorted_root = Path("/Volumes/BigData/bio/experiments/aging/loci.sorted")
# Number of worker processes used for the pairwise bedtools runs.
THREADS_N = 8

# parents=True: do not fail on a machine where the experiment folder does not exist yet.
results_dir.mkdir(parents=True, exist_ok=True)

# Cleanup

In [None]:
#pybedtools.set_tempdir("/tmp")
# Ask pybedtools to remove its temporary files (see the pybedtools docs for the exact scope).
pybedtools.cleanup()
# Manual reset of the caches (destructive, intentionally left commented out):
# !rm {sorted_root}
# !rm {results_dir}

# Known annotations

In [None]:
# loci_root = Path("/mnt/stripe/bio/raw-data/aging/loci_of_interest")
# golden_peaks_root = Path("/mnt/stripe/bio/experiments/aging/peak_calling")
# zinbra_peaks_root = Path("/mnt/stripe/bio/experiments/configs/Y20O20/peaks")
# diff_chip_root = Path("/mnt/stripe/bio/raw-data/aging/chipseq_diff_loci")

loci_root = Path("/Volumes/BigData/bio/raw-data/aging/loci_of_interest")
golden_peaks_root = Path("/Volumes/BigData/bio/experiments/aging/peak_calling") # *.*Peak
zinbra_peaks_root = Path("/Volumes/BigData/bio/experiments/configs/Y20O20/peaks") # *.bed

# Must be a Path (it is globbed later). The former plain-string assignment was dead code:
# it was overwritten immediately and is now kept only as a commented alternative above.
diff_chip_root = loci_root / "chipseq_diff_loci"

# NOTE(review): this still points at the /mnt/stripe layout while every other root in this
# cell uses /Volumes/BigData -- confirm which location is intended on this machine.
signal_root = Path("/mnt/stripe/bio/experiments/signal")

chromhmm_root = loci_root / "chromhmm"

In [None]:
!ls {loci_root}

In [None]:
!ls {zinbra_peaks_root}

In [None]:
!ls {golden_peaks_root}

In [None]:
!ls {diff_chip_root}

## ChromHMM

In [None]:
# Collect the ChromHMM BED files found directly under chromhmm_root.
chromhmm_paths = list(chromhmm_root.glob('*.bed'))
# Order them numerically by the integer that starts the third dot-separated chunk of the
# file name (a chunk such as "14_TssBiv"); a file name without such a chunk raises here.
chromhmm_paths.sort(key=lambda p: int(p.name.split(".")[2].split("_")[0]))

# ChromHMM state identifier (as it appears in file names) -> human-readable label.
CHROMHMM_ST_MAP = {
    "1_TssA": "Active TSS",
    "2_TssFlnk": "Flanking TSS",
    "3_TssFlnkU": "Flanking TSS Upstream",
    "4_TssFlnkD": "Flanking TSS Downstream",
    "5_Tx": "Strong transcription",
    "6_TxWk": "Weak transcription",
    "7_EnhG1": "Genic enhancer1",
    "8_EnhG2": "Genic enhancer2",
    "9_EnhA1": "Active Enhancer 1",
    "10_EnhA2": "Active Enhancer 2",
    "11_EnhWk": "Weak Enhancer",
    "12_ZNF_Rpts": "ZNF genes & repeats",
    "13_Het": "Heterochromatin",
    "14_TssBiv": "Bivalent/Poised TSS",
    "15_EnhBiv": "Bivalent Enhancer",
    "16_ReprPC": "Repressed PolyComb",
    "17_ReprPCWk": "Weak Repressed PolyComb",
    "18_Quies": "Quiescent/Low",
}


def chromhmm_state_descr(s):
    """Translate a ChromHMM file name into its state label.

    The third dot-separated chunk of ``s`` is looked up in ``CHROMHMM_ST_MAP``.
    When ``s`` has fewer than three chunks, or the chunk is not a known state,
    ``s`` is returned unchanged.
    """
    parts = s.split(".")
    if len(parts) > 2 and parts[2] in CHROMHMM_ST_MAP:
        return CHROMHMM_ST_MAP[parts[2]]
    return s

# Show which human-readable label each ChromHMM file will get in the heatmaps.
for p in chromhmm_paths:
    print(chromhmm_state_descr(p.name), "->", p)

## Basic Loci

We cannot include every file from the directory: the list is too long and the heatmap becomes unreadable. Instead we keep a curated list
of loci, selected by the following rules:
* top-level *.bed files in the root folder
* subfolders: "enchancers", "tfs", "regulatory", "weak_consensus", "zinbra_consensus"

In [None]:
# Curated loci: top-level BED files plus every BED file below the selected sub-folders.
loci_paths = list(loci_root.glob('*.bed'))
for folder in ["enchancers", "tfs", "regulatory", "weak_consensus", "zinbra_consensus"]:
    loci_paths += (loci_root / folder).glob('**/*.bed')
loci_paths.sort()
loci_paths

## Diff-Chipseq loci

In [None]:
# Every BED file found anywhere below diff_chip_root.
diff_chip_paths = list(diff_chip_root.glob('**/*.bed'))
diff_chip_paths

## Peaks

In [None]:
def donor_order_id(path):
    """Sort key that orders peak files by donor group and numeric donor id.

    The file name is split on "_" and the first chunk longer than two characters
    that starts with "OD" or "YD" is taken as the donor id, e.g. "OD14" in
    ``OD_OD14_H3K27ac_hg19_1.0E-6_peaks.bed``.

    Returns ``(prefix, number)`` such as ``("OD", 14)`` so that OD2 sorts before
    OD10. Files without a usable donor id (no such chunk, a chunk whose third
    character is "S", or a non-numeric suffix) get ``(file name, 0)``.
    """
    for chunk in path.name.split('_'):
        if len(chunk) > 2 and chunk.startswith(("OD", "YD")):
            # Only the first donor-like chunk is considered.
            if chunk[2] != "S":
                try:
                    return (chunk[:2], int(chunk[2:]))
                except ValueError:
                    # Non-numeric suffix: fall back to ordering by file name
                    # instead of aborting the whole sort.
                    pass
            break

    return (path.name, 0)
    

def collect_peaks(peaks_roots):
    """Gather peak files for every immediate sub-directory of ``peaks_roots``.

    For each sub-directory all ``*.bed`` and ``*.*Peak`` files below it are
    collected recursively, ordered with ``donor_order_id`` and printed.

    Returns a dict mapping the sub-directory name to its ordered list of paths.
    """
    peaks_by_dir = {}
    for subdir in peaks_roots.iterdir():
        if not subdir.is_dir():
            continue
        print("Peaks:", subdir)

        # e.g.
        # * OD_OD14_H3K27ac_hg19_1.0E-6_peaks.bed
        # * OD8_k27ac_hg19_broad_peaks.broadPeak
        # * zinbra_weak_consensus.bed
        found = sorted(chain(subdir.glob("**/*.bed"), subdir.glob("**/*.*Peak")),
                       key=donor_order_id)
        print(len(found))
        print(*map(str, found), sep="\n")
        peaks_by_dir[subdir.name] = found
    return peaks_by_dir

In [None]:
# Peak files under golden_peaks_root, grouped by immediate sub-directory name
# (presumably one sub-directory per histone modification, judging by the variable name).
golden_peaks_by_histmod = collect_peaks(golden_peaks_root)

In [None]:
# Peak files under zinbra_peaks_root, grouped by immediate sub-directory name
# (presumably one sub-directory per histone modification, judging by the variable name).
zinbra_peaks_by_histmod = collect_peaks(zinbra_peaks_root)

## Consensus

In [None]:
# Top-level BED files of the "zinbra_consensus" folder inside loci_root.
zinbra_conensus_paths = list((loci_root / "zinbra_consensus").glob('*.bed'))
zinbra_conensus_paths

# Alternative:
# zinbra_peaks_by_histmod = collect_peaks(zinbra_peaks_root)
# consensus_peaks = []
# for mod, peaks in zinbra_peaks_by_histmod.items():
#     consensus_peaks.extend([p for p in peaks if "consensus" in p.name])

In [None]:
# Top-level BED files of the "golden_consensus" folder inside loci_root.
golden_conensus_paths = list((loci_root / "golden_consensus").glob('*.bed'))
golden_conensus_paths

# Alternative:
# golden_peaks_by_histmod = collect_peaks(golden_peaks_root)
# consensus_peaks = []
# for mod, peaks in golden_peaks_by_histmod.items():
#     consensus_peaks.extend([p for p in peaks if "consensus" in p.name])

## Summary

In [None]:
# Loci side of the peaks-vs-loci comparisons below: curated loci followed by the ChromHMM state files.
all_loci = loci_paths + chromhmm_paths

# Code

In [None]:
!which bedtools

In [None]:
# bedtrace.py
def run(commands, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """Launch a pipeline of commands and return the last command's output.

    :param commands: sequence of argument lists, one per pipeline stage; the
        stdout of each stage is connected to the stdin of the next one.
    :param stdin: stdin of the first stage.
    :param stdout: stdout of the last stage (pipe by default, may be a file).
    :param stderr: stderr setting used for every stage.
    :return: ``(stdout_data, stderr_data)`` of the LAST stage, as returned by
        ``Popen.communicate()``. The stderr of earlier stages is discarded.
    :raises ValueError: if ``commands`` is empty.
    """
    commands = list(commands)
    if not commands:
        raise ValueError("run() needs at least one command")

    processes = []
    last = len(commands) - 1
    _stdin = stdin
    try:
        for i, cmd in enumerate(commands):
            p = subprocess.Popen(cmd, stdin=_stdin,
                                 stdout=stdout if i == last else subprocess.PIPE,
                                 stderr=stderr)
            processes.append(p)
            _stdin = p.stdout
    except BaseException:
        # A later stage failed to start: do not leave the earlier ones running.
        for p in processes:
            p.kill()
            for pipe in (p.stdout, p.stderr):
                if pipe is not None:
                    pipe.close()
            p.wait()
        raise

    upstream = processes[:-1]
    for p in upstream:
        # Allow p1 to receive a SIGPIPE if p2 exits.
        p.stdout.close()

    try:
        return processes[-1].communicate()
    finally:
        # Reap the upstream stages and release their stderr pipes; otherwise every
        # call leaks zombie processes and file descriptors.
        for p in upstream:
            if p.stderr is not None:
                p.stderr.close()
            p.wait()

In [None]:
import shutil
def as_sorted(p: Path, root: Path, sorted_root: Path):
    """Return the path of a cached, coordinate-sorted copy of BED file ``p``.

    The copy is stored at the same position relative to ``sorted_root`` that ``p``
    has relative to ``root``, named ``<stem>.sorted.bed``. An existing copy is
    reused as is. Otherwise ``p`` is sorted with ``sort -k1,1 -k2,2n``, or simply
    copied when ``sort -c`` reports that it is already in order.

    The result is written to a temporary file and renamed into place, so an
    interrupted or failed run never leaves a truncated file in the cache.

    :raises ValueError: if ``p`` is not located under ``root`` (``Path.relative_to``).
    :raises RuntimeError: if ``sort -c`` fails for a reason other than disorder.
    :raises subprocess.CalledProcessError: if sorting itself fails.
    """
    sorted_p = sorted_root / p.relative_to(root)
    sorted_p = sorted_p.parent / (sorted_p.stem + ".sorted.bed")

    if sorted_p.exists():
        return sorted_p

    sorted_p.parent.mkdir(exist_ok=True, parents=True)
    sort_keys = ["-k1,1", "-k2,2n"]

    # Do not resort file if already sorted.
    # `sort -c` exit status: 0 = sorted, 1 = disorder found, >1 = error.
    check = subprocess.run(["sort", "-c"] + sort_keys + [str(p)],
                           stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if check.returncode > 1:
        raise RuntimeError("Cannot check sort order of {}:\n{}".format(
            p, check.stderr.decode(errors="replace")))
    is_sorted = (check.returncode == 0)

    tmp_p = sorted_p.parent / (sorted_p.name + ".tmp")
    try:
        if not is_sorted:
            print("Sorting: ", str(p))
            # By some reason BedTool.sort() fails to sort cds.csv, so plain `sort` is used.
            with open(str(tmp_p), "w") as f:
                subprocess.run(["sort"] + sort_keys + [str(p)], stdout=f, check=True)
        else:
            # just copy file
            shutil.copyfile(str(p), str(tmp_p))
        os.replace(str(tmp_p), str(sorted_p))
    except BaseException:
        if tmp_p.exists():
            tmp_p.unlink()
        raise

    if not is_sorted:
        print("  [Done]", str(sorted_p))

    return sorted_p

In [None]:
# def as_sorted_bedtool(p: Path, root: Path, sorted_root: Path):
#     sorted_p = sorted_root / p.relative_to(root)
#     sorted_p = sorted_p.parent / (sorted_p.stem + ".sorted.bed")

#     if not sorted_p.exists():
#         sorted_p.parent.mkdir(exist_ok=True, parents=True)
        
#         # Do not resort file if already sorted:
#         stderr = run((["sort", "-c", "-k1,1", "-k2,2n", str(p)],))[1]
#         is_sorted = (len(stderr) == 0)
        
#         bt = pybedtools.bedtool.BedTool(str(p))
#         if not is_sorted:
#             print("Sorting: ", str(p))
#             # By some reason BedTool.sort() fails to sort cds.csv
#             # bt.sort().saveas(sorted_p)
#             #stderr = run((["sort", "-c", "-k1,1", "-k2,2n", str(sorted_p)],))[1]
#             #assert len(stderr) == 0, "Expected to be sorted: {}\nError:\n{}".format(sorted_p, stderr)
#             with open(str(sorted_p), "w") as f:
#                 run((["sort", "-k1,1", "-k2,2n", str(p)],), stdout=f)
#             print("  [Done]", str(sorted_p))
#         else:   
#             # just copy file
#             bt.saveas(str(sorted_p))
#         del bt  # Too many open files issue
        
#     return pybedtools.bedtool.BedTool(str(sorted_p))

In [None]:
from multiprocessing import Pool, TimeoutError

def run_bedtools_uniq_wc(ij, a, b):
    """Count the lines of ``bedtools intersect -a a -b b -wa`` after ``uniq``.

    ``ij`` is an opaque tag (the table cell coordinates) that is passed through,
    so results coming back from a worker pool can be placed into the table.

    Returns ``(ij, count)``.
    """
    pipeline = (
        ["bedtools", "intersect", "-a", str(a), "-b", str(b), "-wa"],
        ["uniq"],
        ["wc", "-l"],
    )
    stdout_data = run(pipeline)[0]
    count = int(stdout_data.decode().strip())
    return (ij, count)

# def run_bedtools_uniq_wc(ij, a: pybedtools.BedTool, b: pybedtools.BedTool):
#     # a = as_sorted_bedtool(a)
#     # b = as_sorted_bedtool(b)
#     c = a.intersect(b, wa=True)
#     output = run((["cat", c.fn], ["uniq"], ["wc", "-l"]))
#     del c  # To many open files issues
#     return (ij, int(output[0].decode().strip()))

def run_bedtools_jaccard(ij, a, b):
    """Compute the Jaccard index of two sorted BED files with the ``jaccard.sh`` helper.

    ``ij`` is an opaque tag (the table cell coordinates) that is passed through.

    Returns ``(ij, jaccard)``, where ``jaccard`` is the script's stdout parsed as float.

    :raises ValueError: if the script output is not a number.
    """
    # No shell is involved, so "~" has to be expanded explicitly.
    script = os.path.expanduser("~/work/washu/bed/jaccard.sh")
    # run() expects a sequence of commands: the trailing comma makes this a
    # one-stage pipeline instead of three separate "commands".
    stdout_data, stderr_data = run(([script, str(a), str(b)],))
    text = stdout_data.decode().strip()
    try:
        value = float(text)
    except ValueError:
        raise ValueError("jaccard.sh returned non-numeric output {!r} for {} vs {}; stderr:\n{}".format(
            text, a, b, (stderr_data or b"").decode(errors="replace")))
    return (ij, value)

# def run_bedtools_jaccard(ij, a, b):
#     #bed tools jaccard not symmetrix
#     output = run((["bedtools", "jaccard", "-a", str(a),
#                    "-b", str(b)],
#                   ["cut", "-f", "3"]))
#     stdout = output[0].decode().strip()
#     lines = stdout.split("\n")
#     assert len(lines) == 2, lines
#     assert lines[0] == "jaccard"
#     return (ij, float(lines[1]))

# def run_bedtools_jaccard(ij, a: pybedtools.BedTool, b: pybedtools.BedTool):
#     # a = as_sorted_bedtool(a)
#     # b = as_sorted_bedtool(b)
#     return (ij, a.jaccard(b)["jaccard"])

def calc_intersection_table(a_paths, b_paths, path_to_sorted,
                            threads=4, timeout_hours=10, jaccard=False):
    """Compute a pairwise metric table for every (a, b) combination of files.

    :param a_paths: files that become the rows of the table.
    :param b_paths: files that become the columns of the table.
    :param path_to_sorted: mapping from each original path to its sorted copy;
        the metric is computed on the sorted copies.
    :param threads: number of worker processes.
    :param timeout_hours: how long to wait for each individual result.
    :param jaccard: use ``run_bedtools_jaccard`` instead of ``run_bedtools_uniq_wc``.
    :return: DataFrame indexed by the names of ``a_paths``; the first column
        "total" holds the line count of the row file, the remaining columns
        (named after ``b_paths``) hold the metric values.
    """
    metric = run_bedtools_jaccard if jaccard else run_bedtools_uniq_wc

    # Column 0 is reserved for "total", hence b columns are numbered from 1.
    path_pairs = [((i, j), path_to_sorted[a], path_to_sorted[b])
                  for i, a in enumerate(a_paths, 0)
                  for j, b in enumerate(b_paths, 1)]

    # The context manager shuts the workers down even when a task fails or times
    # out; previously each call leaked its pool.
    with Pool(processes=threads) as pool:
        multiple_results = [pool.apply_async(metric, (ij, a, b))
                            for ij, a, b in path_pairs]
        values = [res.get(timeout=3600*timeout_hours) for res in multiple_results]

    x = np.zeros((len(a_paths), 1 + len(b_paths)), np.float32)
    for (i, j), value in values:
        x[i, j] = value

    for i, a in enumerate(a_paths, 0):
        output = run((["cat", str(a)], ["wc", "-l"],))
        x[i, 0] = int(output[0].decode().strip())

    df = pd.DataFrame(x,
                      index=[f.name for f in a_paths],
                      columns=["total"] + [f.name for f in b_paths])
    return df

In [None]:
def plot_heatmap(title, df, path=None, autoscale=False, label_fun=None, figsize=(10,10),
                 col_cluster=False, row_cluster=False):
    """Draw ``df`` as a seaborn clustermap.

    :param title: plot title.
    :param df: table to plot.
    :param path: when given, the figure is saved there instead of being shown.
    :param autoscale: let seaborn choose the colour range; otherwise it is fixed to [0, 1].
    :param label_fun: optional function applied to every row and column label
        (on a copy, the caller's frame is left untouched).
    :param figsize: figure size passed to seaborn.
    :param col_cluster: cluster the columns.
    :param row_cluster: cluster the rows.
    """
    if autoscale:
        vmin, vmax = None, None
    else:
        vmin, vmax = 0, 1

    if label_fun:
        df = df.copy()
        df.columns = [label_fun(s) for s in df.columns]
        df.index = [label_fun(s) for s in df.index]

    g = sns.clustermap(df,
                       col_cluster=col_cluster, row_cluster=row_cluster,
                       figsize=figsize, cmap="rainbow",
                       metric="chebyshev",
                       vmin=vmin, vmax=vmax, robust=True) #robust=True: ignore color outliers
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)

    plt.title(title)
    if path is None:
        plt.show()
    else:
        # The previous code called savefig() on an undefined `pp` object.
        g.savefig(str(path))

In [None]:
def load_intersection_table(beds, loci, path_to_bt, result_path, threads=4, jaccard=False):
    """Load a cached intersection table, or compute and cache it.

    When ``result_path`` exists it is read back as is: the cache is keyed by
    path only, so it is NOT checked against the current ``beds`` / ``loci``.
    Otherwise the table is computed with ``calc_intersection_table`` and
    written to ``result_path`` (parent directories are created).

    :return: DataFrame indexed by file name.
    """
    if result_path.exists():
        # read_csv(index_col=0) replaces DataFrame.from_csv, which was removed in pandas 1.0.
        df = pd.read_csv(str(result_path), index_col=0)
        print("Loaded: ", result_path)
    else:
        print("Calculating: ", result_path)
        df = calc_intersection_table(beds, loci, path_to_bt, threads=threads, jaccard=jaccard)
        result_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(str(result_path))
        print("  Saved: ", result_path)

    return df

In [None]:
def normalize(df):
    """Divide every row by its "total" value and return the table without "total"."""
    totals = df["total"]
    scaled = df.div(totals, axis=0)
    return scaled.drop("total", axis=1)

In [None]:
def process_intersection(beds, loci, path_to_bt, results_dir, tag,
                         figsize=(10,10), col_cluster=False, row_cluster=True):
    """Compute (or load) the overlap tables for ``beds`` vs ``loci`` and plot them.

    Three tables are cached in ``results_dir`` as ``<tag>_bl.csv`` (beds as rows),
    ``<tag>_lb.csv`` (loci as rows) and ``<tag>_js.csv`` (Jaccard). The count tables
    are normalised by their "total" column; the head of every table is displayed,
    then four heatmaps are drawn: both normalised directions, their geometric mean
    and the Jaccard index.
    """
    def cached_table(rows, cols, suffix, **kw):
        # All tables share the worker count and the "<tag>_<suffix>.csv" naming.
        return load_intersection_table(rows, cols, path_to_bt,
                                       results_dir / "{}_{}.csv".format(tag, suffix),
                                       threads=THREADS_N, **kw)

    counts_bl = cached_table(beds, loci, "bl")
    display(counts_bl.head(3))

    counts_lb = cached_table(loci, beds, "lb")
    display(counts_lb.head(3))

    frac_bl = normalize(counts_bl)
    display(frac_bl.head(3))

    # Transposed so that rows/columns line up with frac_bl.
    frac_lb = normalize(counts_lb).T
    display(frac_lb.head(3))

    jaccard_table = cached_table(beds, loci, "js", jaccard=True).drop("total", axis=1)
    display(jaccard_table.head(3))

    heatmaps = [
        ("Metrics: # intervals from row file intersecting any interval from column file",
         frac_bl, False),
        ("Metrics: # intervals from col file intersecting any interval from row file",
         frac_lb, False),
        ("Metrics: Geometric mean for intersectiong intervals",
         np.sqrt(frac_bl*frac_lb), False),
        ("Metrics: Jaccard",
         jaccard_table, True),
    ]
    for title, table, autoscale in heatmaps:
        plot_heatmap(title, table, autoscale=autoscale,
                     label_fun=chromhmm_state_descr, figsize=figsize,
                     col_cluster=col_cluster, row_cluster=row_cluster)

# TMP

In [None]:
print("Ensure files sorted...")
# Smoke test on the first six curated loci only.
tmp_loci_paths = loci_paths[:6]
mapping = dict(
    (p, as_sorted(p, loci_root, sorted_root / "loci_of_interest"))
    for p in tmp_loci_paths
)
print("[Done]")

process_intersection(tmp_loci_paths, tmp_loci_paths, mapping, results_dir, "tmp0_loci.csv", figsize=(10,10))

# Loci vs Loci

In [None]:
print("Ensure files sorted...")
# Original path -> cached sorted copy, for every curated locus file.
mapping = dict(
    (p, as_sorted(p, loci_root, sorted_root / "loci_of_interest"))
    for p in loci_paths
)
print("[Done]")

process_intersection(loci_paths, loci_paths, mapping, results_dir, "loci", figsize=(15,15))

# Loci vs ChromHMM

In [None]:
print("Ensure files sorted...")
# Original path -> cached sorted copy, for the curated loci and then the ChromHMM states.
mapping = {
    p: as_sorted(p, loci_root, sorted_root / "loci_of_interest")
    for p in loci_paths + chromhmm_paths
}
print("[Done]")

process_intersection(loci_paths, chromhmm_paths, mapping, results_dir, "loci_chromhmm", figsize=(8, 15))

# ChromHMM vs ChromHMM

In [None]:
print("Ensure files sorted...")
# Original path -> cached sorted copy, for every ChromHMM state file.
mapping = dict(
    (p, as_sorted(p, loci_root, sorted_root / "loci_of_interest"))
    for p in chromhmm_paths
)
print("[Done]")
process_intersection(chromhmm_paths, chromhmm_paths, mapping, results_dir, "chromhmm", figsize=(8,8))

# Diff chipseq vs Loci

In [None]:
print("Ensure files sorted...")
# Original path -> cached sorted copy, for the diff ChIP-seq loci and then the ChromHMM states.
mapping = {
    p: as_sorted(p, loci_root, sorted_root / "loci_of_interest")
    for p in diff_chip_paths + chromhmm_paths
}
print("[Done]")
process_intersection(diff_chip_paths, chromhmm_paths, mapping, results_dir, "diff_chip_chromhmm", figsize=(8,8))

In [None]:
print("Ensure files sorted...")
# Original path -> cached sorted copy, for the diff ChIP-seq loci and then the curated loci.
mapping = {
    p: as_sorted(p, loci_root, sorted_root / "loci_of_interest")
    for p in diff_chip_paths + loci_paths
}
print("[Done]")
process_intersection(diff_chip_paths, loci_paths, mapping, results_dir, "diff_chip_loci", figsize=(8,8))

In [None]:
# Top-level BED files of the "zinbra_YO_consensus" folder inside loci_root.
zinbra_YO_consensus_paths = [p for p in (loci_root / "zinbra_YO_consensus").glob("*.bed")]
print("Ensure files sorted...")
mapping = {
    p: as_sorted(p, loci_root, sorted_root / "loci_of_interest")
    for p in diff_chip_paths + zinbra_YO_consensus_paths
}
print("[Done]")
# Unlike the other comparisons, columns are clustered here and rows are not.
process_intersection(diff_chip_paths, zinbra_YO_consensus_paths, mapping, results_dir,
                     "diff_chip_zinbra_YO_consensus",
                     figsize=(8,8), row_cluster=False, col_cluster=True)

# Consensus vs Consensus

In [None]:
print("Ensure files sorted...")
# `consensus_peaks` was never defined in this notebook (it only appears in commented-out
# code); the consensus sets compared here are the zinbra and golden ones.
all_consensus_paths = zinbra_conensus_paths + golden_conensus_paths
mapping = {}
for p in all_consensus_paths:
    mapping[p] = as_sorted(p, loci_root, sorted_root)
print("[Done]")

# Consensus vs consensus: both sides must come from `mapping`, which only holds consensus files.
process_intersection(all_consensus_paths, all_consensus_paths, mapping, results_dir, "consensus", figsize=(15,4))

# Zinbra vs Loci

## Consensus peaks

In [None]:
print("Ensure files sorted...")
mapping = {}
for p in zinbra_conensus_paths:
    # These files are globbed from loci_root / "zinbra_consensus", so they must be
    # resolved relative to loci_root (relative_to(zinbra_peaks_root) raises ValueError).
    mapping[p] = as_sorted(p, loci_root, sorted_root / "loci_of_interest")
for p in all_loci:
    mapping[p] = as_sorted(p, loci_root, sorted_root / "loci_of_interest")
print("[Done]")

# `consensus_peaks` was never defined; the zinbra consensus files are the row side here.
process_intersection(zinbra_conensus_paths, all_loci, mapping, results_dir, "zinbra_consensus_vs_loci", figsize=(15,4))

## All Hist mods:

In [None]:
print("Ensure files sorted...")
# Sorted copies of every zinbra peak file (all histone modifications) ...
mapping = {
    p: as_sorted(p, zinbra_peaks_root, sorted_root / "zinbra")
    for peaks in zinbra_peaks_by_histmod.values()
    for p in peaks
}
# ... followed by the loci they are compared against.
mapping.update(
    (p, as_sorted(p, loci_root, sorted_root / "loci_of_interest"))
    for p in all_loci
)
print("[Done]")

# One set of heatmaps per histone modification.
for mod, peaks in zinbra_peaks_by_histmod.items():
    process_intersection(peaks, all_loci, mapping, results_dir, "zinbra_{}_vs_loci".format(mod), figsize=(17,10))

# Macs vs Loci

## Consensus peaks:

In [None]:
print("Ensure files sorted...")
mapping = {}
for p in golden_conensus_paths:
    # These files are globbed from loci_root / "golden_consensus", so they must be
    # resolved relative to loci_root (relative_to(golden_peaks_root) raises ValueError).
    mapping[p] = as_sorted(p, loci_root, sorted_root / "loci_of_interest")
for p in all_loci:
    mapping[p] = as_sorted(p, loci_root, sorted_root / "loci_of_interest")
print("[Done]")

# `consensus_peaks` was never defined; the golden consensus files are the row side here.
process_intersection(golden_conensus_paths, all_loci, mapping, results_dir, "golden_consensus_vs_loci", figsize=(15,4))

## All Hist mods:

In [None]:
print("Ensure files sorted...")
# Sorted copies of every golden peak file (all histone modifications) ...
mapping = {
    p: as_sorted(p, golden_peaks_root, sorted_root / "golden")
    for peaks in golden_peaks_by_histmod.values()
    for p in peaks
}
# ... followed by the loci they are compared against.
mapping.update(
    (p, as_sorted(p, loci_root, sorted_root / "loci_of_interest"))
    for p in all_loci
)
print("[Done]")

# One set of heatmaps per histone modification.
for mod, peaks in golden_peaks_by_histmod.items():
    process_intersection(peaks, all_loci, mapping, results_dir, "golden_{}_vs_loci".format(mod), figsize=(17,10))


# TODO: RNA-diff

In [None]:
# Placeholder for the TODO above: only displays the path, nothing is computed yet.
loci_root / "rna_diff"

# TODO: Pathway

In [None]:
# Placeholder for the TODO above: only displays the path, nothing is computed yet.
loci_root / "pathway"

# Signal (coverage) vs loci

In [None]:
# Display the directory the signal (coverage) tables are read from.
signal_root

In [None]:
# (data type, normalisation) -> frame with one row per locus set
signal_dfs_by_datatype = {}
# (locus set, normalisation) -> frame with one row per data type
signal_dfs_by_loci = {}

series_by_loci = defaultdict(list)
# Expected layout: signal_root/<data type>/<locus set>/**/*_<norm>_data.csv
data_type_paths = [p for p in signal_root.iterdir() if p.is_dir()]
for i, data_type_path in enumerate(data_type_paths, 1):
    data_type = data_type_path.name
    print("[{}/{}] Processing: {}".format(i, len(data_type_paths), data_type))
    
    for norm in ["raw", "rpkm", "rpm"]:
        print("  Normalization:", norm)
        series_by_datatype = []
        
        # TODO: load from results dir?
        for loci_path in (p for p in data_type_path.iterdir() if p.is_dir()):
            loci = loci_path.name
            files = [p for p in loci_path.glob("**/*_{}_data.csv".format(norm))]
            
            assert len(files) <= 1, "{}@{} [{}] Expected one file, but was {}: {}".format(
                data_type, loci, norm, len(files), files
            )
            if not len(files):
                continue
            
            # read_csv(header=None, index_col=0) replaces DataFrame.from_csv(..., header=None),
            # which was removed in pandas 1.0. The first CSV column becomes the index.
            df = pd.read_csv(str(files[0]), header=None, index_col=0)
            # Only the first value column is used.
            series = df.iloc[:,0]
            series.name = loci
            
            series_by_datatype.append(series) 
            
            # Same values, labelled by data type, for the per-locus view.
            series2 = series.copy()
            series2.name = data_type
            series_by_loci[(loci, norm)].append(series2)
            
        # by data type:    
        df = pd.DataFrame(series_by_datatype, )
        #df.index = [f.stem for f in itertools.islice(files, 10)]
        df.to_csv(str(results_dir / "signal_{}_{}".format(data_type, norm)))
        signal_dfs_by_datatype[(data_type, norm)] = df

# by loci:
for (loci, norm), series in series_by_loci.items():
    df = pd.DataFrame(series, )
    df.to_csv(str(results_dir / "signal_{}_{}".format(loci, norm)))
    signal_dfs_by_loci[(loci, norm)] = df
    

In [None]:
# Spot check: first rows of the H3K4me1 RPKM table (one row per locus set).
signal_dfs_by_datatype[("H3K4me1", "rpkm")].head()

In [None]:
# Spot check: first rows of the RPKM table for one locus set (one row per data type).
signal_dfs_by_loci[("washu_german_rrbs_filtered_dmrs_all_10.hg19", "rpkm")].head()

## Plots

In [None]:
def plot_donors_heatmap(title, df, path=None, autoscale=False, 
                        label_fun=None, figsize=(10,10),
                        donors_difference=True,
                        col_cluster=False, row_cluster=False):
    """Draw a clustermap whose rows are annotated with a per-donor colour.

    :param title: plot title.
    :param df: table to plot; the row labels are treated as donor names.
    :param path: when given, the figure is saved there instead of being shown.
    :param autoscale: let seaborn choose the colour range; otherwise it is fixed to [0, 1].
    :param label_fun: optional function applied to every row and column label
        (on a copy, the caller's frame is left untouched).
    :param figsize: figure size passed to seaborn.
    :param donors_difference: standardise each column (``standard_scale=1``) when
        True, each row (``standard_scale=0``) otherwise.
    :param col_cluster: cluster the columns.
    :param row_cluster: cluster the rows.
    """
    if autoscale:
        vmin, vmax = None, None
    else:
        vmin, vmax = 0, 1
        
    if label_fun:
        df = df.copy()
        df.columns = [label_fun(s) for s in df.columns]
        df.index = [label_fun(s) for s in df.index]
        
    # Green for rows whose label starts with "od" (case-insensitive), blue for all others.
    # The former extra branch compared a lower-cased label with "YD", could never match,
    # and produced "b" either way, so the resulting colours are unchanged.
    donors_colors = ["g" if d.lower().startswith("od") else "b" for d in df.index]
    row_colors = pd.Series(data=donors_colors, index=df.index, name="age")
            
    g = sns.clustermap(df,
                       col_cluster=col_cluster, row_cluster=row_cluster,
                       figsize=figsize, cmap="rainbow",
                       metric="chebyshev",
                       standard_scale = 1 if donors_difference else 0,  #0 (rows) or 1 (columns)
                       vmin=vmin, vmax=vmax,
                       row_colors=row_colors,
                       robust=True) #robust=True: ignore color outliers
    plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)

    plt.title(title)
    if path is None:        
        plt.show()
    else:
        # The previous code called savefig() on an undefined `pp` object.
        g.savefig(str(path))
        
def plot_signal_heatmap(tag, metric, signal_dfs, *args,
                        col_filter_fun=None,
                        **kw):
    """Plot the signal table stored under ``(tag, metric)`` with donors as rows.

    The table is transposed and its rows are put into natural donor order
    (od2 before od10). Row labels must start with lower-case "od" or "yd"
    followed by a number, otherwise an AssertionError / ValueError is raised.

    :param col_filter_fun: optional predicate; only columns it accepts are kept.
    :param args: extra positional arguments for ``plot_donors_heatmap``.
    :param kw: extra keyword arguments for ``plot_donors_heatmap``.
    """
    def inner_donor_order_id(name):
        # ("od", 2) < ("od", 10): numeric instead of lexicographic ordering.
        assert (len(name) > 2 and name.startswith(("od", "yd")))
        return (name[:2], int(name[2:]))

    table = signal_dfs[(tag, metric)].T
    donors_in_order = sorted(table.index.tolist(), key=inner_donor_order_id)
    table = table.loc[donors_in_order, :]

    if col_filter_fun:
        kept_columns = [c for c in table.columns if col_filter_fun(c)]
        table = table.loc[:, kept_columns]

    heatmap_title = "[{}]: {}".format(metric, tag)
    plot_donors_heatmap(heatmap_title, table, *args, **kw)

In [None]:
# Locus-set names that have signal tables, without those starting with "R".
{loci for loci, _norm in signal_dfs_by_loci if not loci.startswith("R")}

### All signal @ CGI

In [None]:
# One heatmap per normalisation for the CpG-island locus set; the table is transposed
# inside plot_signal_heatmap, so rows are donors and columns are data types.
for norm in ["raw", "rpkm", "rpm"]:
    plot_signal_heatmap("ucsc_cpgIslandExt.hg19", norm, signal_dfs_by_loci, 
                        #col_filter_fun=lambda x: x == "meth",
                        donors_difference=True, row_cluster=False, col_cluster=False)

### All signal @ (DMR, 14_TssBiv, 15_EnhBiv)

In [None]:
# For each selected locus set and normalisation, plot the donors (clustered rows)
# against a single data-type column: the filter keeps only "H3K4me1".
for loci in ['cd14_chromhmm.hg19.14_TssBiv', 'cd14_chromhmm.hg19.15_EnhBiv', "washu_german_rrbs_filtered_dmrs_all_10.hg19"]:
    for norm in ["raw", "rpkm", "rpm"]:
        plot_signal_heatmap(loci, norm, signal_dfs_by_loci, 
                            col_filter_fun=lambda x: x == "H3K4me1",
                            donors_difference=True, row_cluster=True, col_cluster=False)

### H3K4me1 signal @ loci

In [None]:
# H3K4me1 RPKM signal per donor across locus sets (clustered columns),
# leaving out the locus sets whose name starts with "R-HSA".
plot_signal_heatmap("H3K4me1", "rpkm", signal_dfs_by_datatype, 
                    col_filter_fun=lambda loci: not loci.startswith("R-HSA"),
                    donors_difference=True, row_cluster=False, col_cluster=True)

### Every data type @ ChromHMM

In [None]:
# One heatmap per normalisation and data type, restricted to the ChromHMM locus sets.
for norm in ["raw", "rpkm", "rpm"]:
    # sorted(): iterate the data types in a reproducible order (a bare set is unordered).
    for histmod in sorted({k for k, _norm in signal_dfs_by_datatype.keys()}):
        plot_signal_heatmap(histmod, norm, signal_dfs_by_datatype, 
                            # The ChromHMM locus sets are named "cd14_chromhmm.*" elsewhere in this
                            # notebook; the former "ch14_chromhmm" prefix matched none of them.
                            col_filter_fun=lambda loci: loci.startswith("cd14_chromhmm"),
                            donors_difference=True, row_cluster=False, col_cluster=True)

## Stat testing

In [None]:
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# Cap on the number of locus sets tested per data type (same effect as the former
# unexplained `if j > 50: break`). Set to None to test all of them.
MAX_LOCI_PER_DATA_TYPE = 52

# Column name -> list of values; becomes a frame with "name" plus one p-value column per normalisation.
signal_pvalues = defaultdict(list)
missed_files = []
ha = "two-sided" # 'less', 'two-sided', or 'greater'
data_type_paths = [p for p in signal_root.iterdir() if p.is_dir()]
for i, data_type_path in enumerate(data_type_paths, 1):
    data_type = data_type_path.name
    print("\n[{}/{}] Processing: {}".format(i, len(data_type_paths), data_type))
    
    for j, loci_path in enumerate(p for p in data_type_path.iterdir() if p.is_dir()):
        loci = loci_path.name
        print(".", end="")

        pvalues = {}
        signal_normalizations = ["raw", "rpkm", "rpm"]
        for norm in signal_normalizations:
            files = [p for p in loci_path.glob("**/*_{}_data.csv".format(norm))]
            
            assert len(files) <= 1, "{}@{} [{}] Expected one file, but was {}: {}".format(
                data_type, loci, norm, len(files), files
            )
            if not len(files):
                missed_files.append("{}@{} [{}]".format(data_type, loci, norm))
                continue
            
            # read_csv(header=None, index_col=0) replaces DataFrame.from_csv(..., header=None),
            # which was removed in pandas 1.0. The first CSV column becomes the index.
            df = pd.read_csv(str(files[0]), header=None, index_col=0)
            # Split the rows into the two groups by the first letter of the row label.
            df_ods = df.loc[[d for d in df.index if d.startswith("o")],:]
            df_yds = df.loc[[d for d in df.index if d.startswith("y")],:]
            pvalue = mannwhitneyu(df_ods.iloc[:,0], df_yds.iloc[:,0],
                                  alternative=ha).pvalue
            pvalues[norm] = pvalue

        signal_pvalues["name"].append("{}@{}".format(data_type, loci))    
        for norm in signal_normalizations:
            # A missing file is recorded as p = 1.0.
            signal_pvalues[norm].append(pvalues.get(norm, 1.0))
        if MAX_LOCI_PER_DATA_TYPE is not None and j + 1 >= MAX_LOCI_PER_DATA_TYPE:
            break
            
print("Missed files: ", len(missed_files))
print("  first 10:", *missed_files[0:10])
df = pd.DataFrame.from_dict(signal_pvalues)
df.head()

In [None]:
# see: http://www.statsmodels.org/dev/_modules/statsmodels/stats/multitest.html
# Benjamini-Hochberg correction, applied to each normalisation column separately.
df_fdr_bh = df.copy()
pvalue_columns = [c for c in df.columns if c != "name"]
for c in pvalue_columns:
    _reject, pvals_corrected, *_ = multipletests(pvals=df.loc[:, c], 
                                                 alpha=0.05, method="fdr_bh")
    df_fdr_bh[c] = pvals_corrected
    
# Row-wise minimum over the p-value columns only: the string "name" column must not
# take part in the comparison.
df_fdr_bh["min"] = df_fdr_bh[pvalue_columns].min(axis=1)
df_fdr_bh_sorted = df_fdr_bh.sort_values(by="min")
df_fdr_bh_005 = df_fdr_bh_sorted[df_fdr_bh_sorted["min"] < 0.05]
print("Passing FDR 0.05 by any metric:", len(df_fdr_bh_005))
df_fdr_bh_005.head()

# TODO

heatmaps

* Have:
    * loci vs loci
    * loci vs ChromHMM
    * Hist.mod consensus vs loci,chromHMM
    * Peaks in hist.mod every donor (OD*, YD*) vs loci,chromHMM
    * consensus vs consensus
    * chipseq diff (Y-O) vs loci,chromHMM
    * ([Y/O]x[Hist mod] consensus) vs loci,chromHMM 

* Todo:
  * raw i-th donor(OD*, YD*) coverage vs loci,chromHMM
  * 
  * 
  * 
  * 


todo2:

4. why tf pipeline fails?