# 2023 Immune dataset H3K27ac

Data:
https://www.encodeproject.org/immune-cells/?type=Experiment&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.cell_slims=hematopoietic+cell&biosample_ontology.classification=primary+cell&control_type!=*&status!=replaced&status!=revoked&status!=archived&biosample_ontology.system_slims=immune+system&biosample_ontology.system_slims=circulatory+system&config=immune

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

In [None]:
def bedl(file):
    """Return the length of every interval in a headerless, tab-separated file.

    The length is computed as column 2 minus column 1 (0-based column
    numbers), i.e. ``end - start`` for BED-like files.

    :param file: path of the file to read.
    :return: pandas Series with one length per row, or an empty numpy array
        when the file is empty or does not exist.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        # Only "nothing to read" is treated as "no intervals"; any other
        # problem (malformed file, interrupt, ...) is allowed to surface.
        return np.zeros(0)
    return tf[2] - tf[1]


def lines(file):
    """Count the records of a headerless, tab-separated file.

    :param file: path of the file to read.
    :return: number of rows parsed by pandas, or 0 when the file is empty
        or does not exist.
    """
    try:
        return len(pd.read_csv(file, sep='\t', header=None))
    except (pd.errors.EmptyDataError, FileNotFoundError):
        # Only "nothing to read" counts as zero records; other failures
        # (malformed file, interrupt, ...) are allowed to surface.
        return 0

def d(a, b):
    """Divide ``a`` by ``b``; a zero denominator yields 0 instead of an error."""
    if b == 0:
        return 0
    return a / b


## Load peaks

In [None]:
# Root directory of the dataset ('~' is expanded to the user's home directory).
PATH = os.path.expanduser('~/data/2023_Immune')
# Cell-type and modification tokens that load_peaks_fdr looks for
# (as plain substrings) in peak file names.
IMMUNE_CELLS = ['BCell', 'TCell']
MODIFICATIONS = ['H3K27ac']

def load_peaks_fdr(path, suffix, fdrs):
    """Collect peak statistics for the peak files found in a directory.

    A file is used when its name contains ``suffix`` and, in addition, one of
    ``fdrs``, one of IMMUNE_CELLS and one of MODIFICATIONS. All checks are
    plain substring tests; the first matching entry of each list wins.

    :param path: directory to scan (non-recursive).
    :param suffix: substring a file name must contain to be considered.
    :param fdrs: FDR values as strings, e.g. ``['0.05']``.
    :return: object-dtype DataFrame with one row per accepted file and the
        columns file, modification, cell, fdr, peaks (number of intervals)
        and avlength (mean interval length, 0 when there are no intervals).
    """
    columns = ['file', 'modification', 'cell', 'fdr', 'peaks', 'avlength']
    rows = []
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        fdr = next((fdr for fdr in fdrs if fdr in f), None)
        cell = next((cc for cc in IMMUNE_CELLS if cc in f), None)
        mod = next((m for m in MODIFICATIONS if m in f), None)
        if not (fdr and cell and mod):
            continue
        # Parse the file once: bedl() yields one length per interval, so the
        # number of peaks is simply the number of lengths.
        ls = bedl(os.path.join(path, f))
        ps = len(ls)
        avls = 0 if ps == 0 else ls.sum() / ps
        rows.append((f, mod, cell, fdr, ps, avls))
    # Build the frame in one go instead of growing it row by row.
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
def _load_tool_peaks(folder, suffix, fdr, tool):
    """Load peak stats from PATH/<folder>, expand file names to full paths and tag the tool."""
    df = load_peaks_fdr(os.path.join(PATH, folder), suffix, [fdr])
    df['file'] = [f'{PATH}/{folder}/{f}' for f in df['file']]
    df['tool'] = tool
    print(tool, len(df))
    return df


# One table per peak caller: (sub-directory, file-name marker, FDR, label).
df_fdr_macs2 = _load_tool_peaks('macs2', '.narrowPeak', '0.05', 'MACS2')
df_fdr_macs2broad = _load_tool_peaks('macs2', '.broadPeak', '0.1', 'MACS2 broad')
df_fdr_sicer = _load_tool_peaks('sicer', 'summary-FDR', '0.01', 'SICER')
df_fdr_span = _load_tool_peaks('span', '.peak', '0.05', 'SPAN')

# Stack all tools into a single table and preview a few random rows.
df_fdr_peaks = pd.concat([df_fdr_macs2, df_fdr_macs2broad, df_fdr_sicer, df_fdr_span])
df_fdr_peaks.sample(5)

In [None]:
# Fixed tool order: the position in this list selects the tab10 colour, so a
# tool gets the same colour in every plot that uses TOOLS_PALETTE.
TOOLS = ['MACS2', 'MACS2 broad', 'SICER', 'SPAN', 'DNAse']
# plt.cm.get_cmap() was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap() is available in old and new releases alike.
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = {t: palette(i) for i, t in enumerate(TOOLS)}

## Peaks number and lengths

In [None]:
# Bar plot of the 'peaks' column of df_fdr_peaks, one bar per tool.
fig, ax = plt.subplots(figsize=(4, 4))
g_results = sns.barplot(
    x='tool', y='peaks', data=df_fdr_peaks,
    palette=TOOLS_PALETTE, edgecolor="black",
    capsize=.2, errwidth=2,
    ax=ax,
)
ax.tick_params(axis='x', rotation=90)
ax.set_title('H3K27ac peaks number')
plt.show()

In [None]:
# DNAse peak file per cell type, all located under PATH/dnaseq.
DNASE = {
    'BCell': PATH + '/dnaseq/BCell_DNAseq_hg38_ENCFF963BED.bed',
    'TCell': PATH + '/dnaseq/TCell_DNAseq_hg38_ENCFF762IIH.bed',
    'Monocyte': PATH + '/dnaseq/Monocyte_DNAseq_hg38_ENCFF087SET.bed',
}

# Print the record count reported by lines() for every DNAse file.
for cell, dnase_file in DNASE.items():
    print(cell, lines(dnase_file))

In [None]:
# Build a long (name, length) table: every DNAse interval length, plus at most
# 10,000 randomly sampled lengths from each peak file.
length_frames = []
for dnase_file in DNASE.values():
    dnase_bed = pd.read_csv(dnase_file, sep='\t', header=None)
    length_frames.append(pd.DataFrame({
        'name': ['DNAse'] * len(dnase_bed),
        'length': dnase_bed[2] - dnase_bed[1],
    }))
for peaks_path, tool in tqdm(zip(df_fdr_peaks['file'], df_fdr_peaks['tool'])):
    peak_lengths = bedl(peaks_path)
    tool_frame = pd.DataFrame({
        'name': [tool] * len(peak_lengths),
        'length': peak_lengths,
    })
    n_keep = min(len(tool_frame), 10_000)
    length_frames.append(tool_frame.sample(n_keep))
t = pd.concat(length_frames).reset_index(drop=True)
del length_frames
t.sample(10)

In [None]:
# Box plot of interval lengths per tool; the y axis is cut at 10,000.
fig, ax = plt.subplots(figsize=(4, 4))
g_results = sns.boxplot(x='name', y='length', data=t, palette=TOOLS_PALETTE, ax=ax)
ax.set_title('H3K27ac peaks length')
ax.tick_params(axis='x', rotation=90)
ax.set_ylim(0, 10_000)
plt.show()

In [None]:
# Per-tool density histogram (with KDE) of interval lengths on a log x axis.
fig, ax = plt.subplots(figsize=(8, 5))
hist_options = dict(
    stat='density', common_bins=False, common_norm=False,
    bins=50, kde=True, log_scale=True, alpha=0.2,
)
g_results = sns.histplot(x='length', hue='name', data=t, ax=ax,
                         palette=TOOLS_PALETTE, **hist_options)
ax.set_xscale('log')
ax.set_ylim(0, 4.1)
ax.set_xlim(1e2, 3e4)
ax.set_title('H3K27ac peaks length')
plt.show()

In [None]:
# Randomly tweak SPAN, SICER lengths for visualization purposes
# NOTE: `t` is modified in place, so every re-run of this cell adds another
# round of noise on top of the previous one.
# Make the column float up front: adding float noise into an integer column
# through .loc relies on silent upcasting, which pandas >= 2.1 deprecates.
t['length'] = t['length'].astype(float)
tsicer = t['name'] == 'SICER'
t.loc[tsicer, 'length'] += np.random.normal(0, 50, size=int(tsicer.sum()))
tspan = t['name'] == 'SPAN'
t.loc[tspan, 'length'] += np.random.normal(0, 50, size=int(tspan.sum()))

In [None]:
# Same length histogram as before, with finer binning (200 bins) and a
# tighter y range.
fig, ax = plt.subplots(figsize=(8, 5))
hist_options = dict(
    stat='density', common_bins=False, common_norm=False,
    bins=200, kde=True, log_scale=True, alpha=0.2,
)
g_results = sns.histplot(x='length', hue='name', data=t, ax=ax,
                         palette=TOOLS_PALETTE, **hist_options)
ax.set_xscale('log')
ax.set_ylim(0, 2)
ax.set_xlim(1e2, 3e4)
ax.set_title('H3K27ac peaks length')
plt.show()

## Overlap H3K27ac vs DHS

In [None]:
import tempfile
tf = tempfile.mktemp()

bench_df = pd.DataFrame(
    columns=['cell', 'name', 'peaks', 'dnase', 'peaks_overlap', 'dnase_overlap'],
    dtype=object
)

for c in tqdm(IMMUNE_CELLS):
    dnase_file = DNASE[c]
    dnase_peaks = lines(dnase_file)
    print(f'Cell {c} dnase {dnase_peaks}')
    for tool in set(df_fdr_peaks['tool']):
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) &
                         (df_fdr_peaks['modification'] == 'H3K27ac') &
                         (df_fdr_peaks['cell'] == c)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        peaks = lines(peaks_file)
        !bedtools intersect -a {peaks_file} -b {dnase_file} -wa -u > {tf}
        peaks_overlap = lines(tf)
        !bedtools intersect -b {peaks_file} -a {dnase_file} -wa -u > {tf}
        dnase_overlap = lines(tf)
        bench_df.loc[len(bench_df)] = (c, tool, peaks, dnase_peaks, peaks_overlap, dnase_overlap)
bench_df

In [None]:
# precision   = overlapping peaks / all peaks
# sensitivity = overlapping DNAse intervals / all DNAse intervals
# f1          = harmonic mean of the two (1e-10 keeps the reciprocals finite)
bench_df['precision'] = list(map(d, bench_df['peaks_overlap'], bench_df['peaks']))
bench_df['sensitivity'] = list(map(d, bench_df['dnase_overlap'], bench_df['dnase']))
f1_scores = []
for s, p in zip(bench_df['sensitivity'], bench_df['precision']):
    reciprocal_sum = d(1, s + 1e-10) + d(1, p + 1e-10)
    f1_scores.append(d(2, reciprocal_sum))
bench_df['f1'] = f1_scores

In [None]:
# print('Ignore outliers')
# print(len(bench_df[bench_df['precision'] < 0.1]))
# bench_df = bench_df[bench_df['precision'] >= 0.1]

In [None]:
import seaborn as sns

# Precision vs. sensitivity, one point per row of bench_df, coloured by tool.
fig, ax = plt.subplots(figsize=(6, 4))
g_results = sns.scatterplot(x='precision', y='sensitivity', hue='name',
                            data=bench_df, palette=TOOLS_PALETTE, ax=ax)
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_xlabel('Peaks overlapping with dnase (precision)')
ax.set_ylabel('Dnase overlapping with peaks (sensitivity)')
plt.tight_layout()
plt.show()

In [None]:
# Distribution of sensitivity per tool.
fig, ax = plt.subplots(figsize=(4, 4))
g_results = sns.boxplot(x='name', y='sensitivity', data=bench_df,
                        palette=TOOLS_PALETTE, ax=ax)
ax.set_ylabel('Dnase overlapping with peaks (sensitivity)')
plt.show()

In [None]:
# Distribution of precision per tool.
fig, ax = plt.subplots(figsize=(4, 4))
g_results = sns.boxplot(x='name', y='precision', data=bench_df,
                        palette=TOOLS_PALETTE, ax=ax)
ax.set_ylabel('Peaks overlapping with dnase (precision)')
plt.show()

In [None]:
# Distribution of the f1 score per tool.
fig, ax = plt.subplots(figsize=(4, 4))
sns.boxplot(x='name', y='f1', data=bench_df, palette=TOOLS_PALETTE, ax=ax)
plt.show()

## Overlap H3K27ac diff vs DHS

In [None]:
TOOLS = list(sorted(set(bench_df['name'])))

diff_bench_df = pd.DataFrame(
    columns=['cell', 'name', 'peaks', 'peaks_overlap', 'dnase', 'dnase_overlap'],
    dtype=object
)

tf = tempfile.mktemp()
tf2 = tempfile.mktemp()


for c in tqdm(IMMUNE_CELLS):
    print(c)
    dnase_file = DNASE[c]
    t = df_fdr_peaks[(df_fdr_peaks['tool'] == 'SPAN') &
                     (df_fdr_peaks['modification'] == 'H3K27ac') &
                     (df_fdr_peaks['cell'] == c)]
    if len(t) == 0:
        continue
    span_file = t['file'].values[0]
    dnase_peaks = lines(dnase_file)
    # Processing single tools information
    for tool in TOOLS:
        if tool == 'SPAN':
            continue
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) &
                         (df_fdr_peaks['modification'] == 'H3K27ac') &
                         (df_fdr_peaks['cell'] == c)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        for name, args in [
            (f'SPAN - {tool}', f' -a {span_file} -b {peaks_file} '),
            (f'{tool} - SPAN', f' -b {span_file} -a {peaks_file} ')]:
            !bedtools intersect {args} -wa -v > {tf}
            peaks = lines(tf)
            !bedtools intersect -a {tf} -b {dnase_file} -wa -u > {tf2}
            peaks_overlap = lines(tf2)
            !bedtools intersect -b {tf} -a {dnase_file} -wa -u > {tf2}
            dnase_overlap = lines(tf2)
            diff_bench_df.loc[len(diff_bench_df)] = \
                (c, name, peaks, peaks_overlap, dnase_peaks, dnase_overlap)

display(diff_bench_df.head())

In [None]:
# Same metrics as for bench_df, computed for the difference sets:
# precision, sensitivity and their harmonic mean (f1).
diff_bench_df['precision'] = list(map(d, diff_bench_df['peaks_overlap'], diff_bench_df['peaks']))
diff_bench_df['sensitivity'] = list(map(d, diff_bench_df['dnase_overlap'], diff_bench_df['dnase']))
diff_f1_scores = []
for s, p in zip(diff_bench_df['sensitivity'], diff_bench_df['precision']):
    reciprocal_sum = d(1, s + 1e-10) + d(1, p + 1e-10)
    diff_f1_scores.append(d(2, reciprocal_sum))
diff_bench_df['f1'] = diff_f1_scores

In [None]:
# Per-tool rows followed by the difference-set rows, renumbered from 0.
full_diff_bench_df = pd.concat([bench_df, diff_bench_df], ignore_index=True)
full_diff_bench_df

In [None]:
# Bar order: single tools first, then SPAN-only and tool-only difference sets.
bar_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_title('Number of peaks')
g_results = sns.barplot(x='name', y='peaks', data=full_diff_bench_df,
                        order=bar_order, capsize=.2, errwidth=1, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylim(top=60_000)  # Limit for visual aesthetics
ax.set_ylabel('Number')
plt.tight_layout()
plt.show()

In [None]:
# Bar order: single tools first, then SPAN-only and tool-only difference sets.
bar_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title('Peaks overlapping with dnase (precision)')
g_results = sns.barplot(x='name', y='precision', data=full_diff_bench_df,
                        order=bar_order, capsize=.2, errwidth=1, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
# Put a legend to the right of the current axis
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Bar order: single tools first, then SPAN-only and tool-only difference sets.
bar_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title('Dnase overlapping with peaks (sensitivity)')
g_results = sns.barplot(x='name', y='sensitivity', data=full_diff_bench_df,
                        order=bar_order, capsize=.2, errwidth=1, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
# Put a legend to the right of the current axis
# ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Compact (4x4) version of the precision bar plot.
bar_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_title('Peaks overlapping with DNAse')
g_results = sns.barplot(x='name', y='precision', data=full_diff_bench_df,
                        order=bar_order, capsize=.2, errwidth=1, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
plt.tight_layout()
plt.show()

In [None]:
# Compact (4x4) version of the sensitivity bar plot.
bar_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
fig, ax = plt.subplots(figsize=(4, 4))
ax.set_title('Dnase overlapping with peaks')
g_results = sns.barplot(x='name', y='sensitivity', data=full_diff_bench_df,
                        order=bar_order, capsize=.2, errwidth=1, ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
plt.tight_layout()
plt.show()

# Hg38 Top peaks overlap with DHS

In [None]:
import tempfile
from itertools import product

tf = tempfile.mktemp()
tf2 = tempfile.mktemp()

benchmark_top_df = pd.DataFrame(
    columns=['cell', 'top', 'dnase', 'peaks_file', 'peaks', 'pg', 'gp', 'tool'],
    dtype=object
)

for c in tqdm(IMMUNE_CELLS):
    print(c)
    if c not in DNASE:
        continue
    dnase_file = DNASE[c]
    dnase_peaks = lines(dnase_file)
    for tool in set(df_fdr_peaks['tool']):
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) &
                         (df_fdr_peaks['modification'] == 'H3K27ac') &
                         (df_fdr_peaks['cell'] == c)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        peaks = lines(peaks_file)
        t = pd.read_csv(peaks_file, sep='\t', header=None)
        t.sort_values(by=[8] if len(t.columns) >= 9 else [4], ascending=False, inplace=True)
        for top in np.linspace(1000, 30000, 30):
            t.head(int(top)).sort_values(by=[0, 1]).to_csv(tf, sep='\t', index=False, header=None)
            peaks = lines(tf)
            ! bedtools intersect -a {tf} -b {dnase_file} -wa -u > {tf2}
            peaks_overlap = lines(tf2)
            ! bedtools intersect -b {tf} -a {dnase_file} -wa -u > {tf2}
            dnase_overlap = lines(tf2)
            benchmark_top_df.loc[len(benchmark_top_df)] = \
                (c, top, dnase_peaks, peaks_file, peaks, peaks_overlap, dnase_overlap, tool)

benchmark_top_df

In [None]:
# 'p' is the plain average of the two overlap counts; precision, sensitivity
# and f1 follow the same definitions as in the per-tool benchmark.
benchmark_top_df['p'] = benchmark_top_df['pg'].add(benchmark_top_df['gp']).div(2)
benchmark_top_df['precision'] = list(map(d, benchmark_top_df['pg'], benchmark_top_df['peaks']))
benchmark_top_df['sensitivity'] = list(map(d, benchmark_top_df['gp'], benchmark_top_df['dnase']))
top_f1_scores = []
for s, p in zip(benchmark_top_df['sensitivity'], benchmark_top_df['precision']):
    reciprocal_sum = d(1, s + 1e-10) + d(1, p + 1e-10)
    top_f1_scores.append(2 / reciprocal_sum)
benchmark_top_df['f1'] = top_f1_scores
benchmark_top_df

In [None]:
def rgb2hex(color):
    """Convert a colour with float channels in [0, 1] to a ``#rrggbb`` string.

    :param color: sequence ``(r, g, b)`` or ``(r, g, b, a)``; an alpha
        channel, if present, is ignored.
    :return: lower-case hex string such as ``'#1f77b4'``.
    """
    r, g, b = color[:3]
    # Round instead of truncating: int() turns e.g. 254.7 into 254, and a
    # channel stored as k / 255 may come back as k - 1 after floating-point
    # multiplication.
    channels = (int(round(channel * 255)) for channel in (r, g, b))
    return "#{0:02x}{1:02x}{2:02x}".format(*channels)

# TOOLS_PALETTE with every colour converted to a hex string.
PLOTLY_TOOLS_PALETTE = dict(zip(TOOLS_PALETTE, map(rgb2hex, TOOLS_PALETTE.values())))

In [None]:
import plotly.graph_objects as go

def plot_top(benchmark_top_df):
    """Show precision/sensitivity curves of the top-N peaks benchmark.

    One line is drawn per (cell, tool) pair that has rows in
    ``benchmark_top_df``; x is the 'precision' column, y the 'sensitivity'
    column. Lines of the same tool share a colour and a single legend entry.

    :param benchmark_top_df: DataFrame with the columns cell, tool, top,
        precision and sensitivity.
    """
    tools_legend_shown = set()
    fig = go.Figure()

    for c, t in product(IMMUNE_CELLS, df_fdr_peaks['tool'].unique()):
        dft = benchmark_top_df[(benchmark_top_df['cell'] == c) &
                               (benchmark_top_df['tool'] == t)]
        if len(dft) == 0:
            continue
        fig.add_trace(go.Scatter(
            x=dft["precision"], y=dft["sensitivity"], mode='lines+markers', name=t,
            hovertext=dft['top'].astype(str) + ' ' + t,
            # Show each tool in the legend only once, however many cells it has.
            showlegend=t not in tools_legend_shown,
            marker_color=PLOTLY_TOOLS_PALETTE[t],
            opacity=0.3,
        ))
        tools_legend_shown.add(t)

    # x carries precision and y carries sensitivity, so the titles must say so
    # (they were swapped before).
    fig.update_xaxes(range=[-0.1, 1.1], title='Peaks overlapping dnase (precision)')
    fig.update_yaxes(range=[-0.1, 1.1], title='Dnase overlapping peaks (sensitivity)')

    fig.layout.template = 'plotly_white'
    fig.update_layout(
        autosize=False,
        width=800,
        height=600,)
    fig.show()


In [None]:
# Curves for every cell type in the benchmark table.
plot_top(benchmark_top_df)

In [None]:
# Same plot restricted to the rows whose cell is 'TCell'.
plot_top(benchmark_top_df[benchmark_top_df['cell'] == 'TCell'])