In [None]:
from pathlib import Path
from itertools import chain
import subprocess 
import os

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
from IPython.display import display
import pybedtools

In [None]:
# Directory where the computed intersection / Jaccard tables (*.csv) are cached.
#results_dir = Path("/mnt/stripe/bio/experiments/aging/loci_of_interest.tables")
results_dir = Path("/Volumes/BigData/bio/experiments/aging/loci_of_interest.tables")
# Only the leaf directory is created here; its parent directory must already exist.
results_dir.mkdir(exist_ok=True)

# sorted root:
# Root under which coordinate-sorted copies of the input files are written
# (see as_sorted_bedtool()).
# sorted_root = Path("/mnt/stripe/bio/experiments/aging/loci.sorted")
sorted_root = Path("/Volumes/BigData/bio/experiments/aging/loci.sorted")

# Passed as `threads` to load_intersection_table() by process_intersection().
THREADS_N = 8

# Cleanup

In [None]:
# Ask pybedtools to clean up its temporary files before starting.
#pybedtools.set_tempdir("/tmp")
pybedtools.cleanup()
# Disabled shell commands, kept for manually wiping the caches:
# !rm {sorted_root}
# !rm {results_dir}

# Known annotations

In [None]:
# Root of the "loci of interest" annotations; searched recursively for *.bed files.
#root = Path("/mnt/stripe/bio/raw-data/aging/loci_of_interest")
loci_root = Path("/Volumes/BigData/bio/raw-data/aging/loci_of_interest")

# Root of the MACS2 peak calls; collect_peaks() scans each of its sub-directories.
#macs2_peaks_root = Path("/mnt/stripe/bio/experiments/aging/peak_calling")
macs2_peaks_root = Path("/Volumes/BigData/bio/experiments/aging/peak_calling") # *.*Peak

# Root of the zinbra peak calls; collect_peaks() scans each of its sub-directories.
# zinbra_peaks_root = Path("/mnt/stripe/bio/experiments/configs/Y20O20/peaks/")
zinbra_peaks_root = Path("/Volumes/BigData/bio/experiments/configs/Y20O20/peaks") # *.bed

# NOTE(review): unlike the other roots this one still points at /mnt/stripe rather
# than /Volumes/BigData - confirm that this is intentional.
signal_root = Path("/mnt/stripe/bio/experiments/signal")

# The ChromHMM state BED files live in a sub-directory of loci_root.
chromhmm_root = loci_root / "chromhmm"

## ChromHMM

In [None]:
# ChromHMM BED files ordered by the integer that starts the third dot-separated
# chunk of the file name (the part before the first "_" of that chunk).
chromhmm_paths = sorted(
    chromhmm_root.glob('*.bed'),
    key=lambda bed: int(bed.name.split(".")[2].split("_")[0]),
)

# ChromHMM state identifier -> human readable description.
CHROMHMM_ST_MAP = {
    "1_TssA": "Active TSS",
    "2_TssFlnk": "Flanking TSS",
    "3_TssFlnkU": "Flanking TSS Upstream",
    "4_TssFlnkD": "Flanking TSS Downstream",
    "5_Tx": "Strong transcription",
    "6_TxWk": "Weak transcription",
    "7_EnhG1": "Genic enhancer1",
    "8_EnhG2": "Genic enhancer2",
    "9_EnhA1": "Active Enhancer 1",
    "10_EnhA2": "Active Enhancer 2",
    "11_EnhWk": "Weak Enhancer",
    "12_ZNF_Rpts": "ZNF genes & repeats",
    "13_Het": "Heterochromatin",
    "14_TssBiv": "Bivalent/Poised TSS",
    "15_EnhBiv": "Bivalent Enhancer",
    "16_ReprPC": "Repressed PolyComb",
    "17_ReprPCWk": "Weak Repressed PolyComb",
    "18_Quies": "Quiescent/Low",
}


def chromhmm_state_descr(s):
    """Translate a ChromHMM file name into a readable state description.

    The third dot-separated chunk of `s` is looked up in CHROMHMM_ST_MAP.
    `s` is returned unchanged when it has fewer than three chunks or when the
    chunk is not a known state, so the function is safe to use as a generic
    label function for arbitrary file names.
    """
    parts = s.split(".")
    try:
        state_key = parts[2]
    except IndexError:
        # Fewer than three chunks: not a ChromHMM-style name.
        return s
    return CHROMHMM_ST_MAP.get(state_key, s)

# Show which description every ChromHMM file name resolves to.
for i, p in enumerate(chromhmm_paths):
    print(chromhmm_state_descr(p.name), "->", p)

## Loci

In [None]:
# Every *.bed file below loci_root except the ones inside the ChromHMM directory.
# Containment is tested on path components: a plain string-prefix test would also
# drop files from sibling directories whose name merely starts with "chromhmm".
loci_paths = sorted(p for p in loci_root.glob('**/*.bed') if chromhmm_root not in p.parents)
for i, p in enumerate(loci_paths):
    print(p)

## Peaks

In [None]:
def donor_order_id(path):
    """Sort key that orders peak files by donor group and donor number.

    The file name is split on "_" and the first chunk of the form ``OD<number>``
    or ``YD<number>`` is used, e.g. ``OD_OD14_H3K27ac_..._peaks.bed`` gives
    ``("OD", 14)``.

    Chunks that start with "OD"/"YD" but are not followed by a plain decimal
    number (e.g. "ODS") are skipped instead of raising ValueError from `int`.

    :param path: object with a ``name`` attribute (e.g. ``pathlib.Path``)
    :return: ``(group, number)`` for donor files, ``(file name, 0)`` otherwise
    """
    for chunk in path.name.split('_'):
        group, number = chunk[:2], chunk[2:]
        # isdecimal() is False for "", so a bare "OD"/"YD" chunk is skipped too.
        if group in ("OD", "YD") and number.isdecimal():
            return (group, int(number))
    return (path.name, 0)
    

def collect_peaks(peaks_roots):
    """Collect peak files from every sub-directory of `peaks_roots`.

    Each immediate sub-directory is searched recursively for ``*.bed`` and
    ``*.*Peak`` files, e.g.

    * OD_OD14_H3K27ac_hg19_1.0E-6_peaks.bed
    * OD8_k27ac_hg19_broad_peaks.broadPeak
    * zinbra_weak_consensus.bed

    The files found are ordered with `donor_order_id` and echoed to stdout.

    :param peaks_roots: directory (``pathlib.Path``) whose sub-directories hold peaks
    :return: dict mapping sub-directory name -> sorted list of peak file paths
    """
    subdirs = [entry for entry in peaks_roots.iterdir() if entry.is_dir()]

    result = {}
    for subdir in subdirs:
        print("Peaks:", subdir)

        found = sorted(
            chain(subdir.glob("**/*.bed"), subdir.glob("**/*.*Peak")),
            key=donor_order_id,
        )
        print(len(found))
        print(*map(str, found), sep="\n")
        result[subdir.name] = found
    return result

In [None]:
# MACS2 peak files, keyed by the name of each sub-directory of macs2_peaks_root.
macs2_peaks_by_histmod = collect_peaks(macs2_peaks_root)

In [None]:
# zinbra peak files, keyed by the name of each sub-directory of zinbra_peaks_root.
zinbra_peaks_by_histmod = collect_peaks(zinbra_peaks_root)

## Summary

In [None]:
# All annotation files: the generic loci first, followed by the ChromHMM states.
all_loci = loci_paths + chromhmm_paths

# Code

In [None]:
# Show which bedtools executable is found on PATH.
!which bedtools

In [None]:
# bedtrace.py
def run(commands, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """Run `commands` as a shell-like pipeline: cmd1 | cmd2 | ... | cmdN.

    :param commands: sequence of argv lists, one per pipeline stage
    :param stdin: stdin of the first stage
    :param stdout: stdout of the last stage (intermediate stages always use a pipe)
    :param stderr: stderr given to every stage
    :return: ``communicate()`` result of the last stage, i.e. a
             ``(stdout, stderr)`` tuple; ``None`` when `commands` is empty
    """
    pipeline = []
    upstream = stdin
    for idx, argv in enumerate(commands):
        is_last = (idx == len(commands) - 1)
        sink = stdout if is_last else subprocess.PIPE
        proc = subprocess.Popen(argv, stdin=upstream, stdout=sink, stderr=stderr)
        pipeline.append(proc)
        # The next stage reads what this one writes.
        upstream = proc.stdout

    if not pipeline:
        return None

    *feeders, tail = pipeline
    for proc in feeders:
        # Drop our copy of the pipe so that an upstream stage can receive
        # SIGPIPE if the stage reading from it exits early.
        proc.stdout.close()
    return tail.communicate()

In [None]:
def as_sorted_bedtool(p: Path, root: Path, sorted_root: Path):
    """Return a BedTool backed by a coordinate-sorted copy of `p`.

    The sorted copy lives at the same relative location under `sorted_root`
    as `p` has under `root`, with the suffix replaced by ``.sorted.bed``.
    It is created on first use and reused afterwards.

    The copy is written to a temporary file and renamed into place only once
    it is complete, so an interrupted or failed sort can never leave a
    truncated file that later runs would mistake for a valid cache entry.

    :param p: input file; must be located under `root`
    :param root: directory `p` is relative to
    :param sorted_root: directory that receives the sorted copies
    :return: ``pybedtools.BedTool`` for the sorted file
    :raises ValueError: if `p` is not located under `root` (from ``relative_to``)
    :raises RuntimeError: if the external ``sort`` reports an error on stderr
    """
    sorted_p = sorted_root / p.relative_to(root)
    sorted_p = sorted_p.parent / (sorted_p.stem + ".sorted.bed")

    if not sorted_p.exists():
        sorted_p.parent.mkdir(exist_ok=True, parents=True)

        # Do not re-sort the file if it is already sorted: `sort -c` only checks
        # the order and reports the first out-of-order line on stderr.
        stderr = run((["sort", "-c", "-k1,1", "-k2,2n", str(p)],))[1]
        is_sorted = (len(stderr) == 0)

        tmp_p = sorted_p.parent / (sorted_p.name + ".tmp")
        try:
            if not is_sorted:
                print("Sorting: ", str(p))
                # For some reason BedTool.sort() fails to sort cds.csv, so the
                # external `sort` is used instead.
                with open(str(tmp_p), "w") as f:
                    sort_err = run((["sort", "-k1,1", "-k2,2n", str(p)],), stdout=f)[1]
                if sort_err:
                    raise RuntimeError(
                        "Failed to sort: {}\nError:\n{}".format(p, sort_err.decode(errors="replace"))
                    )
            else:
                # Already sorted: just copy the file.
                pybedtools.bedtool.BedTool(str(p)).saveas(str(tmp_p))

            # Publish the finished file atomically.
            os.replace(str(tmp_p), str(sorted_p))
        finally:
            # Only present here if something above failed.
            if tmp_p.exists():
                tmp_p.unlink()

        if not is_sorted:
            print("  [Done]", str(sorted_p))

    return pybedtools.bedtool.BedTool(str(sorted_p))

In [None]:
from multiprocessing import Pool, TimeoutError

# def run_bedtools_uniq_wc(ij, a, b):
#     output = run((["cat", a.intersect(b, wa=True).fn], ["uniq"], ["wc", "-l"]))
#     return (ij, int(output[0].decode().strip()))

def run_bedtools_uniq_wc(ij, a: pybedtools.BedTool, b: pybedtools.BedTool):
    """Count the lines of ``a.intersect(b, wa=True)`` after `uniq`.

    The intersection result file is piped through ``uniq | wc -l``, so
    adjacent identical lines are counted once.

    :param ij: opaque cell identifier, returned unchanged with the value
    :return: ``(ij, line_count)``
    """
    overlaps_fn = a.intersect(b, wa=True).fn
    stdout, _ = run((["cat", overlaps_fn], ["uniq"], ["wc", "-l"]))
    return (ij, int(stdout.decode().strip()))

# def run_bedtools_jaccard(ij, a, b):
#     output = run((["bedtools", "jaccard", "-a", a, "-b", b], ["cut", "-f", "3"]))
#     stdout = output[0].decode().strip()
#     lines = stdout.split("\n")
#     assert len(lines) == 2, lines
#     assert lines[0] == "jaccard"
#     return (ij, int(lines[1]))

def run_bedtools_jaccard(ij, a: pybedtools.BedTool, b: pybedtools.BedTool):
    """Return the "jaccard" entry of ``a.jaccard(b)``.

    :param ij: opaque cell identifier, returned unchanged with the value
    :return: ``(ij, jaccard_value)``
    """
    stats = a.jaccard(b)
    return (ij, stats["jaccard"])

def calc_intersection_table(a_paths, b_paths, path_to_bt, threads=4, timeout_hours=10, jaccard=False):
    """Compute a pairwise metric table for every (a, b) pair of files.

    :param a_paths: files that become the rows of the table
    :param b_paths: files that become the columns of the table
    :param path_to_bt: mapping from a path in `a_paths`/`b_paths` to the object
                       handed to the metric function (a BedTool)
    :param threads: number of worker processes
    :param timeout_hours: maximum time to wait for each single result
    :param jaccard: use `run_bedtools_jaccard` instead of `run_bedtools_uniq_wc`
    :return: DataFrame indexed by the `a_paths` file names; the first column,
             "total", is the line count of each `a` file and the remaining
             columns hold the metric for each `b` file
    """
    metric = run_bedtools_jaccard if jaccard else run_bedtools_uniq_wc

    # Column 0 is reserved for "total", hence the b index starts at 1.
    tasks = [((i, j), path_to_bt[a], path_to_bt[b])
             for i, a in enumerate(a_paths, 0)
             for j, b in enumerate(b_paths, 1)]

    # The context manager terminates the workers once all results have been
    # collected (or when one of them fails / times out) instead of leaking a
    # fresh set of worker processes on every call.
    with Pool(processes=threads) as pool:
        pending = [pool.apply_async(metric, task) for task in tasks]
        values = [res.get(timeout=3600 * timeout_hours) for res in pending]

    x = np.zeros((len(a_paths), 1 + len(b_paths)), np.float32)
    for (i, j), value in values:
        x[i, j] = value

    for i, a in enumerate(a_paths, 0):
        # str(): subprocess accepts path-like arguments only on newer Pythons.
        output = run((["cat", str(a)], ["wc", "-l"],))
        x[i, 0] = int(output[0].decode().strip())

    df = pd.DataFrame(x,
                      index=[f.name for f in a_paths],
                      columns=["total"] + [f.name for f in b_paths])
    return df

In [None]:
def plot_heatmap(title, df, path=None, autoscale=False, label_fun=None, figsize=(10,10)):
    """Draw `df` as a heatmap, with columns on the x axis and rows on the y axis.

    :param title: figure title
    :param df: DataFrame with the values to plot
    :param path: when given, the figure is saved to this file instead of
                 being shown
    :param autoscale: let matplotlib choose the color range; otherwise the
                      range is fixed to [0, 1]
    :param label_fun: optional function mapping a column/index label to the
                      text shown on the axis
    :param figsize: figure size passed to ``plt.figure``
    """
    plt.figure(figsize=figsize)

    vmin, vmax = (None, None) if autoscale else (0, 1)

    plt.imshow(df, aspect='auto', cmap="viridis", interpolation="nearest", vmin=vmin, vmax=vmax)

    plt.xticks(np.arange(0, len(df.columns), 1),
               df.columns if label_fun is None else [label_fun(s) for s in df.columns],
               rotation='vertical')
    plt.yticks(np.arange(0, len(df.index), 1),
               df.index if label_fun is None else [label_fun(s) for s in df.index])
    plt.colorbar(orientation='vertical')
    plt.title(title)
    if path is None:
        plt.show()
    else:
        # Previously this branch called savefig() on an undefined name and
        # ignored `path`; save to the requested file and release the figure.
        plt.savefig(str(path), bbox_inches="tight")
        plt.close()

In [None]:
def load_intersection_table(beds, loci, path_to_bt, result_path, threads=4, jaccard=False):
    """Load a cached intersection table, computing and caching it if missing.

    :param beds: files that become the rows of the table
    :param loci: files that become the columns of the table
    :param path_to_bt: mapping passed through to `calc_intersection_table`
    :param result_path: ``pathlib.Path`` of the CSV cache file
    :param threads: number of worker processes used when computing
    :param jaccard: compute the Jaccard metric instead of intersection counts
    :return: DataFrame indexed by the first CSV column (row file names)
    """
    if result_path.exists():
        # DataFrame.from_csv() was removed in pandas 1.0; read_csv() with
        # index_col=0 restores the row labels written by to_csv().
        df = pd.read_csv(str(result_path), index_col=0)
        print("Loaded: ", result_path)
    else:
        print("Calculating: ", result_path)
        df = calc_intersection_table(beds, loci, path_to_bt, threads=threads, jaccard=jaccard)
        df.to_csv(str(result_path))
        print("  Saved: ", result_path)

    return df

In [None]:
def normalize(df):
    """Divide every row of `df` by its "total" value and drop the "total" column."""
    totals = df["total"]
    fractions = df.div(totals, axis="index")
    return fractions.drop(columns="total")

In [None]:
def process_intersection(beds, loci, path_to_bt, results_dir, tag, figsize=(10,10)):
    """Compute (or load) the overlap tables for `beds` vs `loci` and plot them.

    Three tables are cached in `results_dir` as ``<tag>_bl.csv`` (beds as
    rows), ``<tag>_lb.csv`` (loci as rows) and ``<tag>_js.csv`` (Jaccard).
    The head of every intermediate table is displayed, followed by four
    heatmaps: the two normalized overlap tables, their element-wise geometric
    mean and the Jaccard table.

    :param beds: files used as heatmap rows
    :param loci: files used as heatmap columns
    :param path_to_bt: mapping from each path to its sorted BedTool
    :param results_dir: directory holding the cached CSV tables
    :param tag: prefix of the cache file names
    :param figsize: size of every heatmap figure
    """
    def preview(frame):
        # Show the first rows and hand the frame back for further use.
        display(frame.head(5))
        return frame

    def heatmap(title, frame, autoscale):
        plot_heatmap(title, frame, autoscale=autoscale,
                     label_fun=chromhmm_state_descr, figsize=figsize)

    counts_bl = preview(load_intersection_table(
        beds, loci, path_to_bt, results_dir / "{}_bl.csv".format(tag), threads=THREADS_N))
    counts_lb = preview(load_intersection_table(
        loci, beds, path_to_bt, results_dir / "{}_lb.csv".format(tag), threads=THREADS_N))

    frac_bl = preview(normalize(counts_bl))
    # Transposed so that it has the same orientation as frac_bl.
    frac_lb = preview(normalize(counts_lb).T)

    jaccard_path = results_dir / "{}_js.csv".format(tag)
    jaccard_tbl = load_intersection_table(beds, loci, path_to_bt, jaccard_path,
                                          threads=THREADS_N, jaccard=True)
    jaccard_tbl = preview(jaccard_tbl.drop("total", axis=1))

    heatmap("Metrics: # intervals from row file intersecting any interval from column file",
            frac_bl, False)
    heatmap("Metrics: # intervals from col file intersecting any interval from row file",
            frac_lb, False)
    heatmap("Metrics: Geometric mean for intersectiong intervals",
            np.sqrt(frac_bl * frac_lb), False)
    heatmap("Metrics: Jaccard",
            jaccard_tbl, True)

# TMP

In [None]:
# Disabled shell commands for dropping the cached ChromHMM tables (forces recalculation):
#! rm /Volumes/BigData/bio/experiments/aging/loci_of_interest.tables/chromhmm_bl.csv
#! rm /Volumes/BigData/bio/experiments/aging/loci_of_interest.tables/chromhmm_lb.csv
#! rm /Volumes/BigData/bio/experiments/aging/loci_of_interest.tables/chromhmm_js.csv
# NOTE(review): this cell makes the same calls, with the same "chromhmm" tag, as the
# "ChromHMM vs ChromHMM" cell further down - presumably a scratch copy; confirm.
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy of each ChromHMM file.
mapping = {p:as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest") for p in chromhmm_paths}
print("[Done]")

process_intersection(chromhmm_paths, chromhmm_paths, mapping, results_dir, "chromhmm", figsize=(8,8))

# Loci vs Loci

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy of each loci file.
mapping = {p:as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest") for p in loci_paths}
print("[Done]")

# NOTE(review): the tag already ends with ".csv", so the cache files are named
# "loci.csv_bl.csv", "loci.csv_lb.csv" and "loci.csv_js.csv" - probably meant to be
# "loci"; renaming the tag would make existing cached tables be recomputed.
process_intersection(loci_paths, loci_paths, mapping, results_dir, "loci.csv", figsize=(12,15))

# Loci vs ChromHMM

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy, for both the loci and the ChromHMM files.
mapping = {p:as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest") for p in loci_paths}
mapping.update({p:as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest") for p in chromhmm_paths})
print("[Done]")

# Loci as rows, ChromHMM states as columns.
process_intersection(loci_paths, chromhmm_paths, mapping, results_dir, "loci_chromhmm", figsize=(8, 15))

# ChromHMM vs ChromHMM

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy of each ChromHMM file.
mapping = {p:as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest") for p in chromhmm_paths}
print("[Done]")
# ChromHMM states against themselves; tables are cached under the "chromhmm" tag.
process_intersection(chromhmm_paths, chromhmm_paths, mapping, results_dir, "chromhmm", figsize=(8,8))

# Zinbra vs Loci

## Consensus peaks

In [None]:
# zinbra peak files, from any sub-directory, whose file name contains "consensus".
consensus_peaks = []
for mod, peaks in zinbra_peaks_by_histmod.items():
    consensus_peaks.extend([p for p in peaks if "consensus" in p.name])

In [None]:
# Display the collected consensus peak files.
consensus_peaks

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy; peaks and loci are cached under different
# sub-directories of sorted_root because they are relative to different roots.
mapping = {}
for p in consensus_peaks:
    mapping[p] = as_sorted_bedtool(p, zinbra_peaks_root, sorted_root / "zinbra")
for p in all_loci:
    mapping[p] = as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest")
print("[Done]")

# Consensus peaks as rows, all loci (including ChromHMM states) as columns.
process_intersection(consensus_peaks, all_loci, mapping, results_dir, "zinbra_consensus", figsize=(12,8))

## All Hist mods:

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy, for every zinbra peak file and every locus.
mapping = {}
for mod, peaks in zinbra_peaks_by_histmod.items():
    for p in peaks:
        mapping[p] = as_sorted_bedtool(p, zinbra_peaks_root, sorted_root / "zinbra")
for p in all_loci:
    mapping[p] = as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest")
print("[Done]")

# One set of tables and heatmaps per sub-directory; the tag (and thus the cache
# file names) is "zinbra_<sub-directory name>".
for mod, peaks in zinbra_peaks_by_histmod.items():
    process_intersection(peaks, all_loci, mapping, results_dir, "zinbra_{}".format(mod), figsize=(17,10))

# Macs vs Loci

## All Hist mods:

In [None]:
print("Ensure files sorted...")
# Path -> BedTool over the sorted copy, for every MACS2 peak file and every locus.
mapping = {}
for mod, peaks in macs2_peaks_by_histmod.items():
    for p in peaks:
        mapping[p] = as_sorted_bedtool(p, macs2_peaks_root, sorted_root / "macs2")
for p in all_loci:
    mapping[p] = as_sorted_bedtool(p, loci_root, sorted_root / "loci_of_interest")
print("[Done]")

# One set of tables and heatmaps per sub-directory; the tag (and thus the cache
# file names) is "macs2_<sub-directory name>".
for mod, peaks in macs2_peaks_by_histmod.items():
    process_intersection(peaks, all_loci, mapping, results_dir, "macs2_{}".format(mod), figsize=(17,10))


# Signal (coverage) vs loci

In [None]:
# Display the configured signal root; the analysis itself is not implemented yet.
signal_root

In [None]:
# todo

# TODO

Heatmaps:

* Have:
    * loci vs loci
    * loci vs ChromHMM
    * Hist.mod consensus vs loci,chromHMM
    * Per-donor peaks (OD*, YD*) for every hist. mod. vs loci, ChromHMM

* Todo:
  * raw i-th donor(OD*, YD*) coverage vs loci,chromHMM
  * [Y, O]x[Hist mod] consensus vs loci,chromHMM 
  * Y-O diff vs loci,chromHMM
