# Figure6 Donors Overlap HeatMaps

**Plots**
* **Overlap** consistency heatmap
    * ZINBRA Tuned
    * ZINBRA Default
    * Golden (MACS2 + SICER) Default
* **Jaccard** consistency heatmap
    * ZINBRA Tuned
    * ZINBRA Default
    * Golden (MACS2 + SICER) Default

* Out of scope: ZINBRA No Input; ZINBRA Split

**Input**:
* Peak Calling Summary - aging.tsv

# Overlap: Y20O20, Zinbra

In [45]:
# Ensure project configured: print the environment variables the project relies on.
# NOTE(review): presumably PYTHONPATH must contain the WASHU_ROOT checkout so that
# the `downstream` package imported in the next cell can be found - confirm.
! echo $WASHU_ROOT
! echo $PYTHONPATH

/Users/romeo/work/washu
/Users/romeo/work/washu:


In [103]:
from collections import OrderedDict, Counter
from itertools import repeat
import os

import pandas as pd
from pathlib import Path

import downstream.bed_metrics as bm
import downstream.loci_of_interest as loi

%matplotlib inline
%config InlineBackend.figure_format='retina'
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Value passed as `threads=` to every build_donros_heatmap(...) call in this notebook
N_THREADS = 8
# When True, build_donros_heatmap removes an existing metrics table file
# before asking for it to be loaded or rebuilt
CLEAR_CACHE = False

In [50]:
# Load the peak calling summary (tab-separated). The columns used later in this
# notebook are: donor, modification, tool, procedure, file and Status.
donors_df = pd.read_csv("/mnt/stripe/bio/experiments/figures/Peak Calling Summary - aging.tsv", sep='\t')
# Preview the first rows
donors_df.head()

Unnamed: 0,donor,modification,tool,peaks,length,FRiP,procedure,params,file,tags,duplicates,Status
0,OD1,H3K27ac,MACS2,21843,25197959,16.0923,default,--broad --broad-cutoff 1.0E-4,/mnt/stripe/bio/experiments/aging/peak_calling...,31047948,0.14,
1,OD1,H3K27ac,MACS2,21843,25197959,16.0923,tuned,--broad --broad-cutoff 1.0E-4,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,31047948,0.14,
2,OD1,H3K27ac,ZINBRA,20927,27474600,16.4653,default,--fdr 1.0E-6 --gap 5,/mnt/stripe/bio/experiments/aging/peak_calling...,31047948,0.14,
3,OD1,H3K27ac,ZINBRA,24760,37517000,18.2379,tuned,--fdr 1.0E-4 --gap 2,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,31047948,0.14,
4,OD10,H3K27ac,MACS2,30860,39691972,28.9157,default,--broad --broad-cutoff 1.0E-4,/mnt/stripe/bio/experiments/aging/peak_calling...,41258333,0.3,


In [51]:
# File path example: show the full (untruncated) value of the `file` column for the first row
donors_df.loc[0, "file"]

'/mnt/stripe/bio/experiments/aging/peak_calling/H3K27ac/macs_broad/OD1_k27ac_hg19_broad_peaks.broadPeak'

In [55]:
# Inspect all H3K27ac rows of donor YD6 (every tool / procedure combination)
donors_df[(donors_df.modification == "H3K27ac") & (donors_df.donor == "YD6")]

Unnamed: 0,donor,modification,tool,peaks,length,FRiP,procedure,params,file,tags,duplicates,Status
144,YD6,H3K27ac,MACS2,6462,4512432,1.55396,default,--broad --broad-cutoff 1.0E-4,/mnt/stripe/bio/experiments/aging/peak_calling...,43279118,0.07,Failed
145,YD6,H3K27ac,MACS2,9327,7185293,2.16177,tuned,--broad --broad-cutoff 0.01,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,43279118,0.07,Failed
146,YD6,H3K27ac,ZINBRA,1794,14473800,0.468753,default,--fdr 1.0E-6 --gap 5,/mnt/stripe/bio/experiments/aging/peak_calling...,43279118,0.07,Failed
147,YD6,H3K27ac,ZINBRA,17511,15848000,3.44402,tuned,--fdr 1.0E-4 --gap 10,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,43279118,0.07,Failed


In [117]:
def color_annotator_hist(label):
    """
    Map a track label to its histone modification color annotation.

    :param label: track label of the form "<histone>_<donor>_<tool>"; it must
        consist of exactly three "_"-separated parts, otherwise unpacking
        raises ValueError.
    :return: one-element tuple `(("Histone", color),)`, where color is the
        color name assigned to the label's histone modification.
    :raises KeyError: if the histone modification has no color assigned.
    """
    # Only the histone part picks the color; donor and tool are ignored
    histone, _, _ = label.split("_")

    palette = {
        'H3K4me1': 'lightblue',
        'H3K4me3': 'red',
        'H3K27ac': 'black',
        'H3K27me3': 'green',
        'H3K36me3': 'lightgray',
    }
    annotation = ("Histone", palette[histone])
    return (annotation,)

def build_donros_heatmap(donors_df, tools_config, title, metric_df_path, *,
                         tuned=True, overlap_metric=True, hide_failed_tracks=False,
                         row_cluster=False, plot_path=None, threads=4):
    """
    Plot a track-vs-track consistency heatmap for the donors' peak files.

    For every histone modification in `tools_config`, the rows of `donors_df`
    matching that modification, its configured tool and the requested
    procedure are selected. The metrics table for all selected peak files is
    obtained from `bm.load_or_build_metrics_table`, relabeled as
    "<histone>_<donor>_<tool>" and drawn with `bm.plot_metric_heatmap`.
    Per-track tick labels are then replaced by one label per histone
    modification, centered on that modification's group of tracks.

    :param donors_df: summary table; the columns `modification`, `tool`,
        `procedure`, `file`, `donor` and `Status` are read.
    :param tools_config: mapping "histone modification -> tool name"; its
        iteration order defines the order of the groups on both axes.
    :param title: title shown above the column color annotations.
    :param metric_df_path: path of the metrics table file handed to
        `bm.load_or_build_metrics_table`. If the global CLEAR_CACHE is true,
        an existing file at this path is removed first.
    :param tuned: select rows with procedure "tuned" if True, else "default".
    :param overlap_metric: plot the "Overlap" metric if True, else "Jaccard".
    :param hide_failed_tracks: drop tracks whose `Status` equals "Failed" from
        the plotted table (the metrics are still requested for all tracks).
    :param row_cluster: forwarded to `bm.plot_metric_heatmap`; when true the
        Y axis ticks are removed.
    :param plot_path: forwarded to `bm.plot_metric_heatmap(save_to=...)` and
        `bm.save_plot`.
        NOTE(review): presumably None means "show instead of save" - confirm.
    :param threads: forwarded to `bm.load_or_build_metrics_table`.
    :return: None
    """
    procedure = "tuned" if tuned else "default"

    # Materialize the keys once: they are iterated several times and are also
    # used as tick labels below.
    hists = list(tools_config)

    # Collect data: -----------------------------------
    all_paths = []
    all_labels = []
    all_passed = []
    # The placeholder is required, otherwise the "only passed tracks" note is
    # silently dropped by format().
    print("Input Data:{}".format(" only passed tracks" if hide_failed_tracks else ""))
    for h in hists:
        tool = tools_config[h]
        mask = (donors_df.modification == h) & (donors_df.tool == tool) & (donors_df.procedure == procedure)
        # Filter once and reuse the selection for paths, labels and statuses
        selected = donors_df[mask]

        passed = selected.Status != "Failed"
        print("  {}: {} {} [passed: {}, failed: {}]".format(h, procedure, tool, sum(passed), sum(~passed)))

        all_paths.extend(Path(p) for p in selected.file)
        all_labels.extend("{}_{}_{}".format(h, d, tool) for d in selected.donor)
        all_passed.extend(passed)

    # Metric: -----------------------------------
    print("\nMetrics table:")
    print(metric_df_path)
    if CLEAR_CACHE and os.path.exists(metric_df_path):
        os.remove(metric_df_path)

    df = bm.load_or_build_metrics_table(all_paths, all_paths, Path(str(metric_df_path)),
                                        jaccard=not overlap_metric, threads=threads)
    assert df.index.tolist() == df.columns.tolist()
    # NOTE(review): assumes the table rows/columns come back in the same order
    # as `all_paths` - confirm against bm.load_or_build_metrics_table.
    df.index = all_labels
    df.columns = all_labels

    if hide_failed_tracks:
        df = df.loc[all_passed, all_passed]

    # Plot: -----------------------------------
    metrics = "Overlap" if overlap_metric else "Jaccard"
    annotator = color_annotator_hist

    g = bm.plot_metric_heatmap(metrics,
                               df,
                               save_to=plot_path,
                               row_cluster=row_cluster,
                               col_cluster=False,
                               row_color_annotator=annotator,
                               col_color_annotator=annotator,
                               row_colors_ratio=0.025, col_colors_ratio=0.025,
                               # todo y/o
                               figsize=(10, 10), cbar=False,
                               show_or_save_plot=False)

    # Replace donors names with hist modifications: one tick per modification,
    # placed in the middle of the modification's block of tracks.
    hist_donors_cnt = Counter(label.split("_")[0] for label in df.columns)
    ticks = []
    offset = 0
    for h in hists:
        tracks_n = hist_donors_cnt[h]
        ticks.append(offset + tracks_n / 2)
        offset += tracks_n

    g.ax_heatmap.set_xticks(ticks)
    g.ax_heatmap.set_xticklabels(hists, rotation="horizontal", horizontalalignment='center')

    if row_cluster:
        # Turn off Y labels when rows are clustered
        g.ax_heatmap.set_yticks([])
    else:
        g.ax_heatmap.set_yticks(ticks)
        g.ax_heatmap.set_yticklabels(hists, rotation="vertical", verticalalignment='center')

    # Turn off annotations
    g.ax_col_colors.set_yticks([])
    g.ax_row_colors.set_xticks([])

    g.ax_col_colors.set_title(title)

    bm.save_plot(plot_path)

# Overlap

## Tuned ZINBRA

In [None]:
# Tuned ZINBRA peaks, overlap metric; this call passes plot_path, i.e. the PDF location.
# The file names keep their historical "tunned" spelling so that an already
# computed metrics table at that path is still found; only the plot title is fixed.
build_donros_heatmap(
    donors_df,
    OrderedDict(zip(['H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K27me3', 'H3K36me3'], repeat("ZINBRA"))),
    "Tuned ZINBRA Overlap",
    "/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_overlap_tunned_zinbra.csv",
    plot_path="/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_overlap_tunned_zinbra.pdf",
    tuned = True, overlap_metric = True, 
    threads=N_THREADS, hide_failed_tracks=True
)

# Same heatmap again, this time without plot_path.
# NOTE(review): presumably this renders the figure inline instead of saving it - confirm.
build_donros_heatmap(
    donors_df,
    OrderedDict(zip(['H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K27me3', 'H3K36me3'], repeat("ZINBRA"))),
    "Tuned ZINBRA Overlap",
    "/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_overlap_tunned_zinbra.csv",
    tuned = True, overlap_metric = True,
    threads=N_THREADS, hide_failed_tracks=True
)

## Default ZINBRA

In [None]:
# Default ZINBRA peaks for all five histone modifications, overlap metric,
# failed tracks hidden.
build_donros_heatmap(
    donors_df,
    OrderedDict.fromkeys(['H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K27me3', 'H3K36me3'], "ZINBRA"),
    title="Default ZINBRA Overlap",
    metric_df_path="/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_overlap_default_zinbra.csv",
    tuned=False,
    overlap_metric=True,
    hide_failed_tracks=True,
    threads=N_THREADS,
)

## Default GOLDEN

In [None]:
# Default "golden" tools, overlap metric: MACS2 for H3K4me1 / H3K4me3 / H3K27ac,
# SICER for H3K27me3 / H3K36me3; failed tracks hidden.
build_donros_heatmap(
    donors_df,
    OrderedDict([
        ('H3K4me1', "MACS2"),
        ('H3K4me3', "MACS2"),
        ('H3K27ac', "MACS2"),
        ('H3K27me3', "SICER"),
        ('H3K36me3', "SICER"),
    ]),
    title="Default MACS2 & SICER Overlap",
    metric_df_path="/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_overlap_default_golden.csv",
    tuned=False,
    overlap_metric=True,
    hide_failed_tracks=True,
    threads=N_THREADS,
)

# Jaccard

## Tuned ZINBRA

In [None]:
# Tuned ZINBRA peaks for all five histone modifications, Jaccard metric
# (overlap_metric=False), failed tracks hidden.
build_donros_heatmap(
    donors_df,
    OrderedDict.fromkeys(['H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K27me3', 'H3K36me3'], "ZINBRA"),
    title="Tuned ZINBRA Jaccard",
    metric_df_path="/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_jaccard_tuned_zinbra.csv",
    tuned=True,
    overlap_metric=False,
    hide_failed_tracks=True,
    threads=N_THREADS,
)

## Default ZINBRA

In [None]:
# Default ZINBRA peaks for all five histone modifications, Jaccard metric
# (overlap_metric=False), failed tracks hidden.
build_donros_heatmap(
    donors_df,
    OrderedDict.fromkeys(['H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K27me3', 'H3K36me3'], "ZINBRA"),
    title="Default ZINBRA Jaccard",
    metric_df_path="/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_jaccard_default_zinbra.csv",
    tuned=False,
    overlap_metric=False,
    hide_failed_tracks=True,
    threads=N_THREADS,
)

## Default GOLDEN

In [None]:
# Default "golden" tools, Jaccard metric (overlap_metric=False): MACS2 for
# H3K4me1 / H3K4me3 / H3K27ac, SICER for H3K27me3 / H3K36me3; failed tracks hidden.
build_donros_heatmap(
    donors_df,
    OrderedDict([
        *list(zip(['H3K4me1', 'H3K4me3', 'H3K27ac'], repeat("MACS2"))),
        *list(zip(['H3K27me3', 'H3K36me3'], repeat("SICER")))
    ]),
    "Default MACS2 & SICER Jaccard",
    "/mnt/stripe/bio/experiments/configs/benchmark/benchmark/heatmap_jaccard_default_golden.csv",
    tuned = False, overlap_metric = False,
    threads=N_THREADS, hide_failed_tracks=True
)