# 2018 ABF SPAN



In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import os
import re
import subprocess
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm

sns.set_style("whitegrid")

In [None]:
def bedl(file):
    """Return the length (column 2 minus column 1) of every row in a BED-like file.

    The file is read as headerless, tab-separated text.

    :param file: path to the tab-separated file.
    :return: pandas Series with one ``end - start`` value per row, or an empty
        numpy array when the file contains no data.
    :raises: any error other than "file is empty" (missing file, malformed
        rows, fewer than three columns) is propagated to the caller.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # An empty file is the only case treated as "no intervals"; everything
        # else should surface instead of silently becoming an empty result.
        return np.zeros(0)
    return tf[2] - tf[1]


def lines(file):
    """Count the rows of a headerless, tab-separated file.

    :param file: path to the tab-separated file.
    :return: number of rows parsed by pandas; 0 when the file contains no data.
    :raises: any error other than "file is empty" (e.g. a missing file) is
        propagated to the caller.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only a genuinely empty file counts as zero rows.
        return 0
    return len(tf)


def d(a, b):
    """Divide ``a`` by ``b``; a zero denominator yields 0 instead of an error."""
    if b == 0:
        return 0
    return a / b


def last_col(file):
    """Return the last column of a headerless, tab-separated file.

    The number of columns is taken from the first row; only that last column is
    then loaded for the whole file.

    :param file: path to the tab-separated file.
    :return: pandas Series with the last column, or an empty numpy array when
        the file contains no data.
    :raises: any error other than "file is empty" is propagated to the caller.
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
        return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]
    except pd.errors.EmptyDataError:
        # Only a genuinely empty file maps to an empty result.
        return np.zeros(0)


def sorted_file(file):
    """Write a sorted copy of ``file`` to a new temporary file and return its path.

    Sorting is delegated to the external ``sort`` utility with keys
    ``-k1,1 -k2,2n -k3,3n`` (column 1 as text, columns 2 and 3 numerically).
    The caller owns the returned file and is responsible for deleting it.

    :param file: path of the file to sort.
    :return: path of the newly created temporary file.
    :raises subprocess.CalledProcessError: when ``sort`` exits with an error.
    """
    # mkstemp creates the file atomically; the deprecated mktemp only returns a
    # name that another process could claim before it is used.
    fd, ts = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as out:
            # Argument list instead of a shell string: paths with spaces or
            # shell metacharacters are passed through untouched.
            subprocess.run(['sort', '-k1,1', '-k2,2n', '-k3,3n', file],
                           stdout=out, check=True)
    except BaseException:
        # Do not leave a half-written temp file behind on failure.
        os.remove(ts)
        raise
    return ts

In [None]:
def load_peaks(path, suffix, modifications, replicates):
    """Collect peak files from a directory into a DataFrame.

    A file is kept when its name ends with ``suffix``, contains one of
    ``modifications`` (case-insensitive) and contains ``"<replicate>_"`` for
    one of ``replicates`` (case-insensitive). The first matching modification
    and replicate win.

    :param path: directory to scan (non-recursive).
    :param suffix: required file name ending; ``''`` accepts every file.
    :param modifications: modification tags to look for in the file name.
    :param replicates: replicate identifiers to look for in the file name.
    :return: DataFrame with columns ``file`` (full path), ``modification``,
        ``replicate`` and ``peaks`` (row count of the file).
    """
    rows = []
    # sorted() gives a reproducible row order; os.listdir order is arbitrary.
    for f in tqdm(sorted(os.listdir(path))):
        if not f.endswith(suffix):
            continue
        name = f.lower()
        mod = next((m for m in modifications if m.lower() in name), None)
        rep = next((r for r in replicates if f'{r.lower()}_' in name), None)
        # A replicate of '' is a valid match (callers pass [''] when file names
        # carry no replicate id), so only None means "no replicate found".
        if not mod or rep is None:
            continue
        peaks_path = os.path.join(path, f)
        rows.append((peaks_path, mod, rep, lines(peaks_path)))
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns=['file', 'modification', 'replicate', 'peaks'])


# Load peaks

In [None]:
# Root directory holding one sub-directory of peak files per peak caller
PATH = os.path.expanduser('~/data/2018_chipseq_y20o20')
# Modification tags as they are searched for in the peak file names
MODIFICATIONS = ['k4me3', 'k27ac', 'k4me1', 'k27me3', 'k36me3']
# Replicate identifiers: OD1..OD29 followed by YD1..YD29
REPLICATES = [f'{group}{i}' for group in ('OD', 'YD') for i in range(1, 30)]

In [None]:
# One frame per peak caller / parameter set; each gets a `tool` label so the
# frames can be stacked into a single table below.
df_macs2 = load_peaks(
    os.path.join(PATH, 'macs2'), '.narrowPeak', MODIFICATIONS, REPLICATES
).assign(tool='MACS2')
print('MACS2', len(df_macs2))

df_macs2broad = load_peaks(
    os.path.join(PATH, 'macs2'), '.broadPeak', MODIFICATIONS, REPLICATES
).assign(tool='MACS2 broad')
print('MACS2 broad', len(df_macs2broad))

df_sicer = load_peaks(
    os.path.join(PATH, 'sicer'), '-FDR0.01', MODIFICATIONS, REPLICATES
).assign(tool='SICER')
print('SICER', len(df_sicer))

df_span = load_peaks(
    os.path.join(PATH, 'span'), '.peak', MODIFICATIONS, REPLICATES
).assign(tool='SPAN')
print('SPAN', len(df_span))

df_macs2broad_abf = load_peaks(
    os.path.join(PATH, 'macs2_abf'), '.broadPeak', MODIFICATIONS, REPLICATES
).assign(tool='MACS2 ABF')
print('MACS2 broad ABF', len(df_macs2broad_abf))

# Empty suffix: every file in sicer_abf whose name matches a modification and
# a replicate is taken.
df_sicer_abf = load_peaks(
    os.path.join(PATH, 'sicer_abf'), '', MODIFICATIONS, REPLICATES
).assign(tool='SICER ABF')
print('SICER ABF', len(df_sicer_abf))

df_span_tuned = load_peaks(
    os.path.join(PATH, 'span_tuned'), '.bed', MODIFICATIONS, REPLICATES
).assign(tool='SPAN tuned')
print('SPAN tuned', len(df_span_tuned))

df_peaks = pd.concat(
    [df_macs2broad_abf, df_sicer_abf, df_span_tuned,
     df_macs2, df_macs2broad, df_sicer, df_span],
    ignore_index=True,
)
df_peaks.sample(3)

In [None]:
# Number of peaks per tool, one panel per modification: bars with error bars,
# individual files overlaid as dots.
tool_order = ['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN']
fig, axs = plt.subplots(1, 5, figsize=(12, 3))
for ax, m in zip(axs, MODIFICATIONS):
    dfm = df_peaks[df_peaks['modification'] == m]
    shared = dict(data=dfm, x='tool', y='peaks', order=tool_order, ax=ax)
    sns.barplot(capsize=.2, err_kws={'linewidth': 2}, edgecolor="black", **shared)
    sns.stripplot(size=1, color='black', alpha=0.5, **shared)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(m)
plt.tight_layout()
plt.show()

In [None]:
# Drop outliers
# Replicates to exclude, listed per modification
OUTLIERS = {
    'k27ac': ['YD1', 'YD6'],
    'k27me3': ['OD9', 'YD1', 'YD2', 'YD3', 'YD4', 'YD9', 'YD10', 'YD11'],
    'k36me3': ['OD3', 'OD6', 'OD12', 'OD18', 'OD20', 'YD1', 'YD3', 'YD4', 'YD5'],
    'k4me3': ['OD6', 'OD7', 'OD14', 'YD2', 'YD3', 'YD10', 'YD14'],
}
# A file is an outlier when its path contains "<replicate>_h3<mod>" or
# "<replicate>_<mod>" (case-insensitive) for any listed pair.
outlier_tags = [
    tag.lower()
    for m, ol in OUTLIERS.items()
    for o in ol
    for tag in (f'{o}_h3{m}', f'{o}_{m}')
]
t_outliers = [any(tag in f.lower() for tag in outlier_tags) for f in df_peaks['file']]
print('Total outliers', sum(t_outliers))
df_peaks_no = df_peaks[~np.array(t_outliers, dtype=bool)].copy()

In [None]:
# Tools that still have files after outlier removal
pd.unique(df_peaks_no['tool'])

In [None]:
# NOTE(review): this whole cell is commented-out code (a boxplot of peaks per
# tool over df_peaks_no); it executes nothing and is a candidate for deletion.
# plt.figure(figsize=(1, 2))
# sns.boxplot(data=df_peaks_no, x='tool', y='peaks',
#             # order=['MACS2', 'SICER', 'SPAN tuned', 'SPAN 0.05', 'SPAN 1e-4'],
#             )
# # plt.title('K36me3')
# plt.tight_layout()
# plt.show()

In [None]:
# Same per-tool peak counts as before, on the outlier-free table
tool_order = ['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN']
fig, axs = plt.subplots(1, 5, figsize=(12, 3))
for ax, m in zip(axs, MODIFICATIONS):
    dfm = df_peaks_no[df_peaks_no['modification'] == m]
    shared = dict(data=dfm, x='tool', y='peaks', order=tool_order, ax=ax)
    sns.barplot(capsize=.2, edgecolor="black", err_kws={'linewidth': 2}, **shared)
    sns.stripplot(size=1.5, alpha=0.5, color='black', **shared)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(m)
    # The y axis stops at the largest peak count, but never above 120k
    ax.set_ylim(0, min(1.2e5, dfm['peaks'].max()))
plt.tight_layout()
plt.show()

# Lengths

In [None]:
print('Load lengths')
# At most this many peak lengths are kept per file
LENGTHS_SAMPLE_SIZE = 10_000
# Fixed seed so the down-sampling (and every plot built on it) is reproducible
LENGTHS_SAMPLE_SEED = 42

ts = []
for mod, rep, file, tool in tqdm(
        zip(df_peaks_no['modification'], df_peaks_no['replicate'],
            df_peaks_no['file'], df_peaks_no['tool']),
        total=len(df_peaks_no),  # zip has no length; give tqdm the total explicitly
):
    lengths = bedl(file)
    t = pd.DataFrame(dict(length=lengths))
    t = t.sample(min(len(t), LENGTHS_SAMPLE_SIZE), random_state=LENGTHS_SAMPLE_SEED).copy()
    t['modification'] = mod
    t['replicate'] = rep
    t['tool'] = tool
    # Footprint is the total length of ALL peaks in the file, not of the sample
    t['footprint'] = lengths.sum()
    ts.append(t)
df_lens = pd.concat(ts).reset_index(drop=True)
del ts, t
df_lens.sample(10)

In [None]:
sns.set_theme(style="whitegrid")
# Peak length distribution per tool (log scale), one panel per modification.
# NOTE(review): 'MACS2 broad' is absent from the order below, so its lengths
# are not drawn — confirm this is intentional.
length_tool_order = ['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'SICER', 'SPAN']
fig, axs = plt.subplots(1, 5, figsize=(12, 3.5))
for ax, m in zip(axs, MODIFICATIONS):
    print(m)
    sns.boxplot(data=df_lens[df_lens['modification'] == m], x='tool', y='length',
                showfliers=False,
                order=length_tool_order,
                ax=ax)
    ax.set_title(m)
    # Same limits for every modification. The former 'H3K36me3' special case
    # could never trigger (tags here are lowercase, e.g. 'k36me3') and both of
    # its branches set these very limits.
    ax.set_ylim(bottom=20, top=1e5)
    ax.set(yscale='log')
    ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

# Jaccard

In [None]:
from itertools import product
import pyranges as pr

def compute_reps_overlap(df_peaks, modifications, fixed=-1):
    """Compute pairwise overlap statistics between replicates, per modification and tool.

    For every modification the replicates are sorted by name; for every tool
    each pair ``(i, j)`` with ``i < j`` is compared (or only the pairs starting
    at index ``fixed`` when ``fixed != -1``).

    :param df_peaks: DataFrame with columns ``modification``, ``tool``,
        ``replicate`` and ``file``.
    :param modifications: modifications to process, in output order.
    :param fixed: index (in the sorted replicate list) of the only replicate
        allowed as first element of a pair; ``-1`` compares all pairs.
    :return: DataFrame with one row per compared pair and the columns
        ``modification, tool, rep1, rep2, peaks1, peaks1_len, peaks2,
        peaks2_len, peaks1_overlap, peaks2_overlap, peaks_overlap_len, jaccard``.
        ``peaksN`` is the row count of the file, ``peaksN_len`` the summed
        interval length, ``peaksN_overlap`` the size of the pyranges
        ``overlap`` result, ``peaks_overlap_len`` the summed length of the
        pyranges ``intersect`` result and ``jaccard`` is
        ``overlap_len / (len1 + len2 - overlap_len)``.

    NOTE(review): a replicate that has no file for a tool keeps the sentinel
    ``-1`` in ``peaksN`` / ``peaksN_len`` and the pair is still emitted with
    zero overlaps (hence jaccard 0). Downstream plots presumably should filter
    on ``peaks1 > 0`` / ``peaks2 > 0`` — confirm.
    """
    columns = ['modification', 'tool', 'rep1', 'rep2',
               'peaks1', 'peaks1_len', 'peaks2', 'peaks2_len',
               'peaks1_overlap', 'peaks2_overlap', 'peaks_overlap_len']
    rows = []

    tools = sorted(set(df_peaks['tool']))
    for m in modifications:
        tm = df_peaks[df_peaks['modification'] == m]
        reps = sorted(set(tm['replicate']))
        n_reps = len(reps)

        # Index pairs to compare, always with i < j.
        if fixed == -1:
            pairs = [(i, j) for i in range(n_reps) for j in range(i + 1, n_reps)]
        elif 0 <= fixed < n_reps:
            pairs = [(fixed, j) for j in range(fixed + 1, n_reps)]
        else:
            pairs = []

        for tool in tools:
            print(m, tool, n_reps)
            files = [None] * n_reps
            peaks = [-1] * n_reps        # -1: no file for this replicate/tool
            peaks_lens = [-1] * n_reps
            for i, rep in enumerate(reps):
                t1 = tm[(tm['tool'] == tool) & (tm['replicate'] == rep)]
                if len(t1) == 0:
                    continue
                file = t1['file'].values[0]
                peaks[i] = lines(file)
                if peaks[i] == 0:
                    peaks_lens[i] = 0
                    continue
                sorted_path = sorted_file(file)
                try:
                    files[i] = pr.read_bed(sorted_path)
                finally:
                    # The sorted copy is only needed for loading; remove it so
                    # temp files do not pile up across replicates and tools.
                    os.remove(sorted_path)
                peaks_lens[i] = files[i].lengths().sum()

            for i, j in tqdm(pairs):
                rep1, rep2 = reps[i], reps[j]
                file1, file2 = files[i], files[j]
                peaks1, peaks2 = peaks[i], peaks[j]
                peaks1_len, peaks2_len = peaks_lens[i], peaks_lens[j]
                if peaks1 <= 0 or peaks2 <= 0:
                    # Missing or empty side: nothing can overlap.
                    rows.append((m, tool, rep1, rep2, peaks1, peaks1_len, peaks2, peaks2_len,
                                 0, 0, 0))
                    continue
                overlap1 = len(file1.overlap(file2))
                overlap2 = len(file2.overlap(file1))
                overlap_len = file1.intersect(file2).lengths().sum()
                rows.append((m, tool, rep1, rep2, peaks1, peaks1_len, peaks2, peaks2_len,
                             overlap1, overlap2, overlap_len))

    # Build the frame once instead of appending row by row.
    reps_overlap = pd.DataFrame(rows, columns=columns)
    reps_overlap['jaccard'] = [
        d(lo, l1 + l2 - lo)
        for l1, l2, lo in zip(reps_overlap['peaks1_len'], reps_overlap['peaks2_len'],
                              reps_overlap['peaks_overlap_len'])
    ]
    return reps_overlap

In [None]:
# Pairwise replicate overlap for every modification and tool, outliers excluded
df_overlap_full = compute_reps_overlap(df_peaks=df_peaks_no, modifications=MODIFICATIONS)

In [None]:
# Distribution of pairwise replicate Jaccard, grouped by modification and tool
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(
    data=df_overlap_full, x='modification', hue='tool', y='jaccard',
    hue_order=['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    showfliers=False,
    ax=ax,
)
ax.set(title='Jaccard between replicates', ylabel='Jaccard', xlabel='Modification')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Every replicate pair contributes two overlap fractions: rep1 against rep2,
# then rep2 against rep1. The (modification, tool) labels are stacked twice to
# line up with that order.
overlaps = []
for side in ('peaks1', 'peaks2'):
    overlaps.extend(
        d(o, p)
        for o, p in zip(df_overlap_full[f'{side}_overlap'], df_overlap_full[side])
    )
t = pd.concat([df_overlap_full[['modification', 'tool']]] * 2, ignore_index=True)
t['overlap'] = overlaps

fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(
    data=t, x='modification', hue='tool', y='overlap',
    hue_order=['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    showfliers=False,
    ax=ax,
)
ax.set(title='Overlap between replicates', ylabel='Overlap', xlabel='Modification')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

# Comparison vs ENCODE CD14 data

```
cd ~/data/2024_cd14_monocytes_chipseq
snakemake -p -s ~/work/chipseq-smk-pipeline/Snakefile all --cores all  --use-conda  --directory $(pwd) --config genome=hg38 fastq_dir=$(pwd) start_with_bams=True bams_dir=bams span=True span_params="--debug --keep-cache" span_threads=2 --rerun-trigger mtime 
```
Then lift the resulting hg38 peak files over to hg19 with `liftOver`:
```
for F in sicer/*hg38*FDR0.01; do echo $F; NF=${F/hg38/hg19}; echo $NF; liftOver -bedPlus=6 $F hg38ToHg19.over.chain $NF ${NF}_unmapped; done
for F in macs2/*hg38*Peak; do echo $F; NF=${F/hg38/hg19}; echo $NF; liftOver -bedPlus=6 $F hg38ToHg19.over.chain $NF ${NF}_unmapped; done
for F in span/*hg38*.peak; do echo $F; NF=${F/hg38/hg19}; echo $NF; liftOver -bedPlus=6 $F hg38ToHg19.over.chain $NF ${NF}_unmapped; done
```

In [None]:
CD14_ENCODE_PATH = os.path.expanduser('~/data/2024_cd14_monocytes_chipseq')
# Replicates are given as ['']: load_peaks then only requires an underscore in
# the file name and stores '' as the replicate.
df_cd14_macs2 = load_peaks(os.path.join(CD14_ENCODE_PATH, 'macs2'), '.narrowPeak', MODIFICATIONS, [''])
df_cd14_macs2['tool'] = 'MACS2'
print('MACS2', len(df_cd14_macs2))

df_cd14_macs2broad = load_peaks(os.path.join(CD14_ENCODE_PATH, 'macs2'), '.broadPeak', MODIFICATIONS, [''])
df_cd14_macs2broad['tool'] = 'MACS2 broad'
print('MACS2 broad', len(df_cd14_macs2broad))

df_cd14_sicer = load_peaks(os.path.join(CD14_ENCODE_PATH, 'sicer'), '-FDR0.01', MODIFICATIONS, [''])
df_cd14_sicer['tool'] = 'SICER'
print('SICER', len(df_cd14_sicer))

df_cd14_span = load_peaks(os.path.join(CD14_ENCODE_PATH, 'span'), '.peak', MODIFICATIONS, [''])
df_cd14_span['tool'] = 'SPAN'
print('SPAN', len(df_cd14_span))

df_cd14 = pd.concat([df_cd14_macs2, df_cd14_macs2broad, df_cd14_sicer, df_cd14_span]).reset_index(drop=True)
# Ignore multiple versions for several modifications: keep only files whose
# path contains both 'GSM1102807' and 'hg19' (plain substring match).
# .copy() makes df_cd14 an independent frame, so columns can be added to it
# later without a SettingWithCopyWarning.
df_cd14 = df_cd14[
    df_cd14['file'].str.contains('GSM1102807', regex=False)
    & df_cd14['file'].str.contains('hg19', regex=False)
].copy()
df_cd14.sample(3)

In [None]:
# Peak counts of the CD14 ENCODE files per tool, one panel per modification
fig, axs = plt.subplots(1, 5, figsize=(10, 3))
for ax, m in zip(axs, MODIFICATIONS):
    dfm = df_cd14[df_cd14['modification'] == m]
    sns.barplot(
        data=dfm, x='tool', y='peaks',
        order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
        capsize=.2, edgecolor="black",
        err_kws={'linewidth': 2},
        ax=ax,
    )
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(m)
plt.tight_layout()
plt.show()

In [None]:
# Tag both tables with their origin and stack them for a side-by-side plot.
# Both source frames gain a `dataset` column in place.
df_peaks_no['dataset'] = 'Y20O20'
df_cd14['dataset'] = 'ENCODE'
t = pd.concat([df_peaks_no, df_cd14], ignore_index=True)

tool_order = ['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN']
fig, axs = plt.subplots(1, 5, figsize=(14, 3))
last_panel = len(axs) - 1
for i, (ax, m) in enumerate(zip(axs, MODIFICATIONS)):
    dfm = t[t['modification'] == m]
    sns.barplot(
        data=dfm, x='tool', y='peaks', hue='dataset',
        order=tool_order,
        capsize=.2, edgecolor="black",
        err_kws={'linewidth': 2},
        ax=ax,
    )
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(m)
    # Only the right-most panel keeps its legend, moved outside the axes
    if i == last_panel:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    else:
        ax.legend().set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Reuse the replicate-overlap machinery to compare MACS2 against SPAN on the
# ENCODE data: the tool name becomes the "replicate" and both share one
# synthetic tool label.
t = (
    df_cd14.loc[df_cd14['tool'].isin(['MACS2', 'SPAN'])]
    .assign(replicate=lambda frame: frame['tool'], tool='MACS2&SPAN')
)
df_cd14_overlap = compute_reps_overlap(t, MODIFICATIONS)
df_cd14_overlap

In [None]:
print('ENCODE')
fig, ax = plt.subplots(figsize=(5, 3))
sns.barplot(data=df_cd14_overlap, x='modification', hue='tool', y='jaccard', ax=ax)
ax.set(
    title='Jaccard between CD14 ENCODE MACS2 & SPAN',
    ylabel='Jaccard',
    xlabel='Modification',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Two overlap fractions per comparison row: peaks1_overlap / peaks1 is labelled
# 'macs2_by_span', peaks2_overlap / peaks2 'span_by_macs2'.
rows = [
    (rec['modification'], label, d(rec[f'{side}_overlap'], rec[side]))
    for _, rec in df_cd14_overlap.iterrows()
    for side, label in (('peaks1', 'macs2_by_span'), ('peaks2', 'span_by_macs2'))
]
t = pd.DataFrame(rows, columns=['modification', 'name', 'overlap'])
del rows

fig, ax = plt.subplots(figsize=(5, 3))
sns.barplot(data=t, x='modification', hue='name', y='overlap', ax=ax)
ax.set(
    title='Overlap between CD14 ENCODE MACS2 & SPAN',
    ylabel='Overlap',
    xlabel='Modification',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Which ENCODE peak set each Y20O20 tool variant is compared against
ENCODE_TOOL_FOR = {
    'MACS2': 'MACS2', 'MACS2 ABF': 'MACS2',
    'MACS2 broad': 'MACS2 broad',
    'SICER': 'SICER', 'SICER ABF': 'SICER',
    'SPAN': 'SPAN', 'SPAN tuned': 'SPAN',
}

ts = []
for (m, tool), dfmt in df_peaks_no.groupby(['modification', 'tool']):
    encode_tool = ENCODE_TOOL_FOR.get(tool)
    if encode_tool is None:
        continue  # tool without an ENCODE counterpart
    matching = (df_cd14['modification'] == m) & (df_cd14['tool'] == encode_tool)
    t = df_cd14[matching].copy().reset_index(drop=True)
    # The ENCODE rows join the group as an extra replicate of the same tool
    t['replicate'] = 'ENCODE'
    t['tool'] = tool
    ts.extend([t, dfmt.copy().reset_index(drop=True)])
df_joint = pd.concat(ts).reset_index(drop=True)
df_joint.head(5)

In [None]:
# fixed=0 keeps only pairs whose first member is the replicate sorting first by
# name; among 'ENCODE', 'OD*' and 'YD*' that is 'ENCODE'.
df_joint_overlap = compute_reps_overlap(df_peaks=df_joint, modifications=MODIFICATIONS, fixed=0)
df_joint_overlap.sample(3)

In [None]:
fig, ax = plt.subplots(figsize=(8, 3))
sns.barplot(
    data=df_joint_overlap, x='modification', y='jaccard', hue='tool',
    hue_order=['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    ax=ax,
)
ax.set(
    title='Jaccard between Y20O20 and CD14 ENCODE ',
    ylabel='Jaccard',
    xlabel='Modification',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()

In [None]:
def update_abf_modifications(df):
    """Rename the short modification tags in ``df['modification']`` in place.

    ``k4me3``, ``k27ac``, ``k4me1``, ``k27me3`` and ``k36me3`` become the same
    name prefixed with ``H3K...``; any other value is left untouched.
    Nothing is returned.
    """
    full_names = {
        'k4me3': 'H3K4me3',
        'k27ac': 'H3K27ac',
        'k4me1': 'H3K4me1',
        'k27me3': 'H3K27me3',
        'k36me3': 'H3K36me3',
    }
    for short, full in full_names.items():
        df.loc[df['modification'] == short, 'modification'] = full


# Tools shown in this figure, in legend order
TOOLS = ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']
t = df_joint_overlap.loc[df_joint_overlap['tool'].isin(TOOLS)].copy()

# One tab10 colour per tool, assigned in TOOLS order
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = {tool_name: palette(idx) for idx, tool_name in enumerate(TOOLS)}

update_abf_modifications(t)

fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(
    data=t, x='modification', y='jaccard', hue='tool',
    hue_order=TOOLS,
    palette=TOOLS_PALETTE,
    capsize=.2, err_kws={'linewidth': 2},
    ax=ax,
)
# Individual comparisons as dots over the bars
sns.stripplot(
    data=t, x='modification', y='jaccard', hue='tool',
    dodge=True, size=2, palette='dark:black', alpha=0.4,
    legend=False,
    hue_order=TOOLS,
    ax=ax,
)
ax.set(title='Jaccard between ULI and ENCODE', ylabel='Jaccard', xlabel='Modification')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Peak-level overlap fractions for each comparison row.
# NOTE(review): 'precision' is computed from the rep1 side and 'recall' from
# the rep2 side; rep1 is presumably the ENCODE set here, which would make
# these two names swapped if ENCODE is the reference — confirm. F1 is
# symmetric and unaffected.
df_joint_overlap['precision'] = list(
    map(d, df_joint_overlap['peaks1_overlap'], df_joint_overlap['peaks1'])
)
df_joint_overlap['recall'] = list(
    map(d, df_joint_overlap['peaks2_overlap'], df_joint_overlap['peaks2'])
)
f1_scores = []
for p, r in zip(df_joint_overlap['precision'], df_joint_overlap['recall']):
    # Harmonic mean of the two fractions; 0 when either one is not positive
    f1_scores.append(2 / (1 / p + 1 / r) if min(p, r) > 0 else 0)
df_joint_overlap['f1'] = f1_scores

In [None]:
fig, ax = plt.subplots(figsize=(8, 3))
sns.barplot(
    data=df_joint_overlap, x='modification', y='f1', hue='tool',
    hue_order=['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    ax=ax,
)
ax.set(
    title='F1 between Y20O20 and CD14 ENCODE ',
    ylabel='F1',
    xlabel='Modification',
)
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Join
# Long format: two rows per comparison, one overlap fraction per direction
rows = [
    (rec['modification'], rec['tool'], label, d(rec[f'{side}_overlap'], rec[side]))
    for _, rec in df_joint_overlap.iterrows()
    for side, label in (('peaks1', 'ENCODE_by_Y20O20'), ('peaks2', 'Y20O20_by_ENCODE'))
]
overlap_df = pd.DataFrame(rows, columns=['modification', 'tool', 'name', 'overlap'])
del rows
overlap_df.sample(3)

In [None]:
# One panel per tool: both overlap directions for every modification
panel_tools = ['MACS2 ABF', 'SICER ABF', 'SPAN tuned', 'MACS2', 'MACS2 broad', 'SICER', 'SPAN']
fig, axs = plt.subplots(1, 7, figsize=(20, 3))
last_panel = len(axs) - 1
for i, (ax, tool) in enumerate(zip(axs, panel_tools)):
    print(tool)
    sns.barplot(
        data=overlap_df[overlap_df['tool'] == tool],
        x='modification', hue='name', y='overlap',
        ax=ax,
    )
    ax.set(title=f'Overlap {tool}', ylabel='Overlap', xlabel='Modification')
    ax.set_ylim(0, 1)
    # Only the right-most panel keeps its legend, moved outside the axes
    if i == last_panel:
        sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
    else:
        ax.legend().set_visible(False)
    ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()

# Consensus peaks vs ChromHMM markup

In [None]:
CONSENSUS_PEAKS = {
    m: f'{PATH}/{m}_consensus_union50%.bed' for m in MODIFICATIONS
}
for m in MODIFICATIONS:
    ! wc -l {CONSENSUS_PEAKS[m]}
    pass

In [None]:
print('Split chromHMM into separate files')
# CHROMHMM_FILE = f'{PATH}/cd14_chromhmm_hg19_ENCFF808WWL.bed'
CHROMHMM_FILE = f'{PATH}/cd14_chromhmm_hg19_ENCFF072SCA.bed'

# Group the original lines by the exact value of column 4 (the state name).
# A substring search (grep) would put e.g. 'TxWk' rows into the 'Tx' file and
# would find nothing for a state containing '/' once it has been sanitised.
state_records = {}
with open(CHROMHMM_FILE) as src:
    for record in src:
        fields = record.rstrip('\n').split('\t')
        if len(fields) < 4:
            continue  # not a data row (blank or header-like line)
        state_records.setdefault(fields[3], []).append(record)

# state name (with '/' replaced by '_', safe for file names) -> per-state BED file
CHROMM_STATES = {}
for raw_state, records in state_records.items():
    state = raw_state.replace('/', '_')
    f = CHROMHMM_FILE.replace('.bed', f'_{state}.bed')
    CHROMM_STATES[state] = f
    with open(f, 'w') as out:
        out.writelines(records)
del state_records

In [None]:
# For every modification: number of consensus peaks, then for every chromHMM
# state the number of state segments overlapped by at least one consensus peak
# (`bedtools intersect -a <state> -b <peaks> -wa -u`).
rows = []
for m in ['k4me1', 'k4me3', 'k27ac', 'k27me3', 'k36me3']:
    print(m)
    peaks_file = sorted_file(CONSENSUS_PEAKS[m])
    try:
        peaks = lines(peaks_file)
        row = [m, peaks]
        for state, state_file in tqdm(CHROMM_STATES.items()):
            # check=True: a failing bedtools call raises instead of being
            # silently counted as zero overlaps.
            result = subprocess.run(
                ['bedtools', 'intersect', '-b', peaks_file, '-a', state_file, '-wa', '-u'],
                capture_output=True, text=True, check=True,
            )
            # Count output records directly; no intermediate temp file needed.
            state_peaks = sum(1 for record in result.stdout.splitlines() if record.strip())
            row.append(state_peaks)
    finally:
        # The sorted copy is a temp file owned by this loop iteration.
        os.remove(peaks_file)
    rows.append(row)

chromhmm_ovlp_df = pd.DataFrame(rows, columns=['modification', 'peaks'] + list(CHROMM_STATES.keys()))
chromhmm_ovlp_df

In [None]:
# Turn the per-state counts into fractions of all segments of that state
chromhmm_rel_df = chromhmm_ovlp_df.copy()
for state, state_file in CHROMM_STATES.items():
    n_segments = lines(state_file)  # parse each state file once
    chromhmm_rel_df[state] = chromhmm_rel_df[state] / n_segments if n_segments > 0 else 0
chromhmm_rel_df = chromhmm_rel_df.set_index('modification')
# Strip a leading numeric prefix such as '12_' from the state names; the
# pattern is anchored so an underscore inside a name is left alone.
chromhmm_rel_df = chromhmm_rel_df.rename(columns=lambda c: re.sub(r'^[0-9]+_', '', c))
# Fixed display order; this also drops the absolute 'peaks' column
chromhmm_rel_df = chromhmm_rel_df[['TssA', 'TssFlnk', 'TssFlnkU', 'TssFlnkD', 'Tx', 'TxWk',
                                   'EnhG1', 'EnhG2', 'EnhA1', 'EnhA2', 'EnhWk', 'ZNF_Rpts',
                                   'Het', 'TssBiv', 'EnhBiv', 'ReprPC', 'ReprPCWk', 'Quies']]
chromhmm_rel_df

In [None]:
# Heatmap: rows are modifications, columns are chromHMM states
fig, ax = plt.subplots(figsize=(6, 2.5))
sns.heatmap(chromhmm_rel_df, cmap='coolwarm', ax=ax)
plt.title('Overlap of consensus peaks with ChromHMM')
ax.tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()