# SPAN Benchmarks H3K4me3 vs RNA-seq

See https://pubmed.ncbi.nlm.nih.gov/27169896/


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a BED-like file.

    The file is read as a headerless tab-separated table; column 1 is used as
    the start coordinate and column 2 as the end coordinate.

    :param file: path to the tab-separated peaks file
    :return: pandas Series with ``end - start`` per row, or an empty numpy
        array when the file contains no data
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file means "no peaks". Other failures (missing file,
        # malformed table) propagate instead of being counted as zero peaks.
        return np.zeros(0)
    return tf[2] - tf[1]


def lines(file):
    """Count the data rows of a headerless tab-separated file.

    :param file: path to the tab-separated file
    :return: number of rows parsed by pandas, 0 when the file contains no data
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only an empty file means zero rows. Other failures (missing file,
        # malformed table) propagate instead of being reported as 0.
        return 0
    return len(tf)

def d(a, b):
    """Divide a by b, yielding 0 instead of failing when b is zero."""
    if b == 0:
        return 0
    return a / b

# Hg38 load peaks

In [None]:
# Root folder of the hg38 GSE26320 data; per-tool peak folders are read from it
GSE26320_PATH_HG38 = os.path.expanduser('~/data/2023_GSE26320')

# Cell names, matched as substrings of the peak file names
# Reduced alternative: ['GM12878',  'K562', 'H1']
GSE26320_CELLS = [
    'GM12878',
    'HMEC',
    'HSMM',
    'K562',
    'NHEK',
    'NHLF',
    'H1',
    'Huvec',
    'HepG2',
]

# Only H3K4me3 is analysed here. Full alternative:
# ['CTCF', 'H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K9ac', 'H4K20me1']
GSE26320_MODIFICATIONS = [
    'H3K4me3',
]

# Replicate labels
GSE26320_REPS = [
    'rep1',
    'rep2',
]

In [None]:

def load_peaks_fdr(path, suffix, fdrs):
    """Collect peak count and average peak length for the peak files in a folder.

    A file is used when its name contains `suffix`, one of `fdrs`, one of
    GSE26320_CELLS and one of GSE26320_MODIFICATIONS (all matched as plain
    substrings). The replicate is 'rep1' when the name contains 'rep1',
    otherwise 'rep2'.

    :param path: folder with peak files
    :param suffix: substring that a peak file name must contain
    :param fdrs: FDR labels to look for in file names
    :return: DataFrame with columns file, modification, cell, replicate, fdr,
        peaks, avlength; `file` holds the bare file name
    """
    columns = ['file', 'modification', 'cell', 'replicate', 'fdr', 'peaks', 'avlength']
    df_fdr = pd.DataFrame(columns=columns, dtype=object)
    for fname in tqdm(os.listdir(path)):
        if suffix not in fname:
            continue
        # First matching label wins for each attribute
        fdr = next((label for label in fdrs if label in fname), None)
        cell = next((name for name in GSE26320_CELLS if name in fname), None)
        mod = next((name for name in GSE26320_MODIFICATIONS if name in fname), None)
        rep = 'rep1' if 'rep1' in fname else 'rep2'
        if not (fdr and cell and mod):
            continue
        peaks_path = os.path.join(path, fname)
        n_peaks = lines(peaks_path)
        lengths = bedl(peaks_path)
        if n_peaks == 0:
            avg_length = 0
        else:
            avg_length = sum(lengths) / n_peaks
        df_fdr.loc[len(df_fdr)] = (fname, mod, cell, rep, fdr, n_peaks, avg_length)
    return df_fdr

In [None]:
def _load_tool_peaks(tool, folder, suffix, fdrs):
    """Load one tool's peaks, make the file paths absolute and tag rows with the tool."""
    table = load_peaks_fdr(os.path.join(GSE26320_PATH_HG38, folder), suffix, fdrs)
    table['file'] = [f'{GSE26320_PATH_HG38}/{folder}/{f}' for f in table['file']]
    table['tool'] = tool
    print(tool, len(table))
    return table


df_fdr_macs2 = _load_tool_peaks('MACS2', 'macs2', '.narrowPeak', ['0.05'])

df_fdr_macs2broad = _load_tool_peaks('MACS2 broad', 'macs2', '.broadPeak', ['0.1'])

df_fdr_sicer = _load_tool_peaks('SICER', 'sicer', 'summary-FDR', ['0.01'])

df_fdr_span = _load_tool_peaks('SPAN', 'span', '.peak', ['0.05'])

# One table with the peaks of all tools
df_fdr_peaks = pd.concat([df_fdr_macs2, df_fdr_macs2broad, df_fdr_sicer, df_fdr_span])
df_fdr_peaks.sample(5)

In [None]:
# TOOLS_PALETTE = {'MACS2': 'blue', 'MACS2 broad': 'orange', 'SICER': 'green', 'SPAN': 'red', 'Genes': 'brown'}
TOOLS = ['MACS2', 'MACS2 broad', 'SICER', 'SPAN', 'Genes']
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9
palette = plt.get_cmap('tab10')
# One tab10 colour (RGBA tuple) per tool, in TOOLS order
TOOLS_PALETTE = {t: palette(i) for i, t in enumerate(TOOLS)}

In [None]:
# Disabled alternative: the same data as a boxplot
# plt.figure(figsize=(4, 4))
# ax = plt.axes()
# g_results = sns.boxplot(data=df_fdr_peaks, x='tool', y='peaks', ax=ax,
#                         palette=TOOLS_PALETTE)
# ax.xaxis.set_tick_params(rotation=90)
# ax.title.set_text('H3K4me3 peaks number')
# plt.show()

# Number of peaks per tool as a bar plot with error bars
ax = plt.figure(figsize=(4, 4)).add_subplot(111)
bar_style = dict(capsize=.2, errwidth=2, edgecolor="black")
g_results = sns.barplot(data=df_fdr_peaks, x='tool', y='peaks', ax=ax,
                        palette=TOOLS_PALETTE, **bar_style)
ax.tick_params(axis='x', rotation=90)
ax.set_title('H3K4me3 peaks number')
plt.show()

In [None]:
# Build a (tool, length) table with at most 10 000 randomly sampled peaks per file
ts = []
for file, tool in tqdm(zip(df_fdr_peaks['file'], df_fdr_peaks['tool'])):
    lengths = bedl(file)
    per_file = pd.DataFrame({'tool': [tool] * len(lengths), 'length': lengths})
    sample_size = min(len(per_file), 10_000)
    ts.append(per_file.sample(sample_size))
t = pd.concat(ts)
t = t.reset_index(drop=True)
del ts
t.sample(10)

In [None]:
# Peak length distribution per tool; the y axis is cut at 10 000 bp
ax = plt.figure(figsize=(4, 4)).add_subplot(111)
g_results = sns.boxplot(data=t, x='tool', y='length', ax=ax)
ax.set_title('H3K4me3 peaks length')
ax.tick_params(axis='x', rotation=90)
ax.set_ylim([0, 10_000])
plt.show()

In [None]:
# Per-tool density of peak lengths on a log-scaled x axis
ax = plt.figure(figsize=(8, 5)).add_subplot(111)
hist_options = dict(stat='density', common_bins=False, common_norm=False,
                    kde=True, log_scale=True, alpha=0.2)
g_results = sns.histplot(data=t, x='length', hue='tool', ax=ax,
                         palette=TOOLS_PALETTE, **hist_options)
g_results.set(xscale='log')
g_results.set_ylim(0, 3)
g_results.set_xlim(1e2, 2e4)
ax.set_title('H3K4me3 peaks length')
plt.show()

In [None]:
# Randomly tweak SPAN,SICER lengths for visualization purposes
# Add Gaussian noise (mean 0, sd 100) to SPAN and SICER lengths; this is
# done for visualization purposes only
tspan, tsicer = (t['tool'] == name for name in ('SPAN', 'SICER'))
for mask in (tspan, tsicer):
    noise = np.random.normal(0, 100, size=sum(mask))
    t.loc[mask, 'length'] += noise

In [None]:
# Per-tool density of peak lengths on a log-scaled x axis
ax = plt.figure(figsize=(8, 5)).add_subplot(111)
hist_options = dict(stat='density', common_bins=False, common_norm=False,
                    kde=True, log_scale=True, alpha=0.2)
g_results = sns.histplot(data=t, x='length', hue='tool', ax=ax,
                         palette=TOOLS_PALETTE, **hist_options)
g_results.set(xscale='log')
g_results.set_ylim(0, 3)
g_results.set_xlim(1e2, 2e4)
ax.set_title('H3K4me3 peaks length')
plt.show()

## Consistency analysis between replicates

In [None]:
import tempfile

def overlap_by_length(file1, file2, min_overlap):
    tf = tempfile.mktemp()
    !bedtools intersect -a {file1} -b {file2} -wo > {tf}
    try:
        peaks1 = pd.read_csv(file1, sep='\t', header=None, nrows=1)
        ncol1 = len(peaks1.columns)
    except:
        # Empty file
        ncol1 = None
    try:
        peaks2 = pd.read_csv(file2, sep='\t', header=None, nrows=1)
        ncol2 = len(peaks2.columns)
    except:
        # Empty file
        ncol2 = None

    if ncol1 is not None and ncol2 is not None:
        try:
            overlap = pd.read_csv(tf, sep='\t', header=None)
        except:
            return 0, 0, 0
        overlap = overlap[overlap[ncol1 + ncol2] > min_overlap]
        overlap1 = overlap[[0, 1, 2, ncol1 + ncol2]].groupby([0, 1, 2]).aggregate(sum).reset_index()
        overlap2 = overlap[[ncol1, ncol1 + 1, ncol1 + 2, ncol1 + ncol2]]. \
            groupby([ncol1, ncol1 + 1, ncol1 + 2]).aggregate(sum).reset_index()
        return len(overlap1), len(overlap2), overlap2[ncol1 + ncol2].sum()
    else:
        return 0, 0, 0

In [None]:
from itertools import product

reps_overlap = pd.DataFrame(columns=['modification', 'cell', 'tool', 'rep1', 'rep2',
                                     'peaks1', 'peaks2', 'overlap', 'peaks1_overlap', 'peaks2_overlap'], dtype=object)

tools = list(sorted(set(df_fdr_peaks['tool'])))
for c, m in tqdm(product(GSE26320_CELLS, GSE26320_MODIFICATIONS)):
    print(c, m)
    tm = df_fdr_peaks[(df_fdr_peaks['cell'] == c) & (df_fdr_peaks['modification'] == m)]
    reps = list(sorted(set(tm['replicate'])))
    for tool in tools:
        for i in range(len(reps)):
            for j in range(i + 1, len(reps)):
                rep1, rep2 = reps[i], reps[j]
                t1 = tm[(tm['tool'] == tool) & (tm['replicate'] == rep1)]
                t2 = tm[(tm['tool'] == tool) & (tm['replicate'] == rep2)]
                file1 = t1['file'].values[0]
                file2 = t2['file'].values[0]
                peaks1 = t1['peaks'].values[0]
                peaks2 = t2['peaks'].values[0]
                overlap1, overlap2, _ = overlap_by_length(file1, file2, 100)
                reps_overlap.loc[len(reps_overlap)] = \
                    (m, c, tool, rep1, rep2, peaks1, peaks2, '100bp', overlap1, overlap2)
                for overlap, overlap_param in [
                    ('1bp', ''),
                    ('50%', ' -f 0.5 ')
                ]:
                    overlap1 = !bedtools intersect -a {file1} -b {file2} -wa -u {overlap_param} | wc -l
                    overlap1 = int(overlap1[0])
                    overlap2 = !bedtools intersect -b {file1} -a {file2} -wa -u {overlap_param} | wc -l
                    overlap2 = int(overlap2[0])

                    reps_overlap.loc[len(reps_overlap)] = \
                        (m, c, tool, rep1, rep2, peaks1, peaks2, overlap, overlap1, overlap2)

reps_overlap['peak1_overlap_fraction'] = [d(x, y) for x, y in zip(reps_overlap['peaks1_overlap'], reps_overlap['peaks1'])]
reps_overlap['peak2_overlap_fraction'] = [d(x, y) for x, y in zip(reps_overlap['peaks2_overlap'], reps_overlap['peaks2'])]
reps_overlap

In [None]:
# Stack both per-replicate overlap fractions into a single 'value' column
id_columns = ['modification', 'cell', 'tool']
t = pd.concat([
    reps_overlap[id_columns + [fraction, 'overlap']].rename(columns={fraction: 'value'})
    for fraction in ('peak1_overlap_fraction', 'peak2_overlap_fraction')
]).reset_index(drop=True)
t

In [None]:
# One panel per modification: replicate overlap fraction per tool and overlap criterion
n_mods = len(GSE26320_MODIFICATIONS)
plt.figure(figsize=(n_mods * 4, 3))
for k, m in enumerate(GSE26320_MODIFICATIONS):
    ax = plt.subplot(1, n_mods, k + 1)
    sns.boxplot(data=t[t['modification'] == m], x='tool', y='value', hue='overlap', ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title(m)
    ax.set_xlabel('Tool')
    # Only the first panel carries the y label
    ax.set_ylabel('Overlap' if k == 0 else None)
    if k != n_mods - 1:
        ax.get_legend().remove()
    else:
        # Put a legend to the right of the last axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.81))
# plt.savefig(f'{GSE26320_PATH_HG38}/analyze/overlap.pdf', bbox_inches='tight', dpi=300)
plt.show()

# H3K4me3 vs RNA-seq

Benchmark according to the paper https://doi.org/10.1093/bib/bbw035
Features that define the best ChIP-seq peak calling algorithms

We considered a peak as positive if it overlaps the promoter of an expressed gene (RPKM > 0.5).
The top 15 000 peak calls from the different methods are ranked by their significance or by their fold enrichment for the thresholding method. We plotted the correct peak fraction (fraction of the top 1000×n peaks that overlap with active promoters) detected as a function of the recovered promoter fraction (fraction of the active promoters that overlap with the top 1000×n peaks).

## Load hg38 gtf

In [None]:
# Read the GENCODE annotation as a headerless tab-separated table; lines
# starting with '#' are skipped
gtf_path = os.path.expanduser('~/data/2023_Immune/gencode.GRCh38.p13.v41.annotation.gtf')
gtf_columns = ['chromosome', 'db', 'type', 'start', 'end', 'point1', 'strand', 'point2', 'aux']
gtf_df = pd.read_csv(gtf_path, sep='\t', comment='#', names=gtf_columns)
gtf_df.sample(10)

In [None]:
import re

print('Parse GTF aux data')
# Attribute name -> list of values in row order. An attribute becomes a
# column only if it yields exactly one value per GTF record.
auxes = {}
for aux in tqdm(gtf_df['aux']):
    for pair in aux.split(';'):
        kv = pair.strip().split(' ')
        if len(kv) != 2:
            continue
        k, v = kv
        auxes.setdefault(k, []).append(v.strip('"'))

for k, vs in auxes.items():
    if len(vs) == len(gtf_df):
        gtf_df[k] = vs
    else:
        # Missing in some records or repeated within a record
        print(f'Ignoring {k}')
del auxes
gtf_df.drop('aux', axis=1, inplace=True)

# Fix . in gene_id: drop everything from the first dot on. The raw string
# avoids the invalid-escape warning that '\.' raises in a normal literal.
gtf_df['gene_id'] = [re.sub(r'\..*', '', gene_id) for gene_id in gtf_df['gene_id']]

In [None]:
# Summary counts of the annotation
is_gene = gtf_df['type'] == 'gene'
is_protein_coding = gtf_df['gene_type'] == 'protein_coding'
print(f'Total hg38 records {len(gtf_df)}')
print(f'Total hg38 genes {is_gene.sum()}')
print(f'Total hg38 protein_coding genes {(is_gene & is_protein_coding).sum()}')

# Keep gene-level records only
gtf_genes_df = gtf_df[is_gene]
gtf_genes_df.sample(5)

## Annotate quantified RNA-seq with TSS positions

In [None]:
import re

# Folder with the per-sample RNA-seq quantification tables
GSE26320_RNASEQ_PATH = os.path.expanduser('~/data/2023_GSE26320_rnaseq')

# (cell, replicate) -> quantification file name inside GSE26320_RNASEQ_PATH
# NOTE(review): ('H1', 'rep2') and ('NHEK', 'rep2') point at the rep1 files -
# presumably only one RNA-seq replicate is available for them; confirm.
EXPRESSION_TSV = {
    ('GM12878', 'rep1'): 'GM12878_rep1_RNAseq_hg38_ENCFF723ICA.tsv',
    ('GM12878', 'rep2'): 'GM12878_rep2_RNAseq_hg38_ENCFF418FIT.tsv',
    ('H1', 'rep1'): 'H1_rep1_RNAseq_hg38_ENCFF093NEQ.tsv',
    ('H1', 'rep2'): 'H1_rep1_RNAseq_hg38_ENCFF093NEQ.tsv',
    ('HMEC', 'rep1'): 'HMEC_rep1_RNAseq_hg38_ENCFF292FVY.tsv',
    ('HMEC', 'rep2'): 'HMEC_rep2_RNAseq_hg38_ENCFF219EZH.tsv',
    ('HSMM', 'rep1'): 'HSMM_rep1_RNAseq_hg38_ENCFF551YKI.tsv',
    ('HSMM', 'rep2'): 'HSMM_rep2_RNAseq_hg38_ENCFF497BUG.tsv',
    ('HepG2', 'rep1'): 'HepG2_rep1_RNAseq_hg38_ENCFF773JNC.tsv',
    ('HepG2', 'rep2'): 'HepG2_rep2_RNAseq_hg38_ENCFF570ZOT.tsv',
    ('Huvec', 'rep1'): 'Huvec_rep1_RNAseq_hg38_ENCFF454MTF.tsv',
    ('Huvec', 'rep2'): 'Huvec_rep2_RNAseq_hg38_ENCFF134FYP.tsv',
    ('K562', 'rep1'): 'K562_rep1_RNAseq_hg38_ENCFF139IXQ.tsv',
    ('K562', 'rep2'): 'K562_rep2_RNAseq_hg38_ENCFF088RDE.tsv',
    ('NHEK', 'rep1'): 'NHEK_rep1_RNAseq_hg38_ENCFF004JWA.tsv',
    ('NHEK', 'rep2'): 'NHEK_rep1_RNAseq_hg38_ENCFF004JWA.tsv',
    ('NHLF', 'rep1'): 'NHLF_rep1_RNAseq_hg38_ENCFF577DEC.tsv',
    ('NHLF', 'rep2'): 'NHLF_rep2_RNAseq_hg38_ENCFF412KNK.tsv',
}

# (cell, replicate) -> path of the BED file with expressed-gene TSS windows
EXPRESSION_TSS_BED = {}
# Half-width of the window around a TSS, bp
TSS = 5000
# A gene counts as expressed when log1p(TPM) exceeds this value
THRESHOLD = 0.05

# For every sample: plot the expression distribution, keep expressed genes
# and save a +/-TSS bp window around each of their TSS as a BED file
for (cell, rep), path in tqdm(EXPRESSION_TSV.items()):
    print(cell, rep, path)
    t = pd.read_csv(GSE26320_RNASEQ_PATH + '/' + path, sep='\t')
    # Fix . in gene_id: drop everything from the first dot on (raw string
    # avoids the invalid-escape warning of '\.')
    t['gene_id'] = [re.sub(r'\..*', '', gene_id) for gene_id in t['gene_id']]
    t['LTPM'] = np.log1p(t['TPM'])

    # Plot expression
    plt.figure(figsize=(10, 1))
    ax = plt.axes()
    g_results = sns.histplot(data=t, x='LTPM', ax=ax,
                             stat = 'density',
                             bins = 500, kde = True, alpha = 0.4)
    ax.axvline(x=THRESHOLD, color='green')
    ax.title.set_text(f'{cell} {rep} expression')
    plt.show()

    # Save expressed genes
    full_df = pd.merge(left=t, right=gtf_genes_df, left_on='gene_id', right_on='gene_id')
    print(f'Total genes with positions {len(full_df)}')
    # .copy(): columns are overwritten below, which must not happen on a
    # view of the merged frame (SettingWithCopyWarning)
    full_df = full_df[full_df['LTPM'] > THRESHOLD].copy()
    print(f'Genes logTPM > {THRESHOLD} {len(full_df)}')

    # TSS is the gene start on the '+' strand and the gene end otherwise
    tss = np.where(full_df['strand'] == '+', full_df['start'], full_df['end'])
    full_df['start'] = np.maximum(1, tss - TSS)  # clamp at the chromosome start
    full_df['end'] = tss + TSS
    full_df = full_df[['chromosome', 'start', 'end', 'gene_id', 'strand', 'TPM', 'LTPM']] \
        .sort_values(by=['chromosome', 'start'])

    # Plot expressed genes lengths
    # plt.figure(figsize=(4, 2))
    # ax = plt.axes()
    # g_results = sns.histplot(x=full_df['end']-full_df['start'], ax=ax,
    #                          stat='density',
    #                          bins=100, log_scale=True, kde=True, alpha=0.4)
    # g_results.set(xscale='log')
    # ax.title.set_text(f'{cell} genes lengths')
    # plt.show()

    expr_path = GSE26320_RNASEQ_PATH + '/' + path.replace('.tsv', f'_tpm{THRESHOLD}_tss{TSS}.bed')
    full_df.to_csv(expr_path, sep='\t', index=False, header=False)
    print(f'Saved expressed genes fom {cell} to {expr_path}')
    EXPRESSION_TSS_BED[(cell, rep)] = expr_path


## Overlap H3K4me3 with genes

In [None]:
import tempfile
from itertools import product

tf = tempfile.mktemp()

bench_df = pd.DataFrame(
    columns=['cell', 'replicate', 'name', 'overlap', 'peaks', 'genes', 'peaks_overlap', 'genes_overlap'],
    dtype=object
)

for c, r in tqdm(product(GSE26320_CELLS, GSE26320_REPS)):
    print(c, r)
    if (c, r) not in EXPRESSION_TSS_BED:
        continue
    expr_file = EXPRESSION_TSS_BED[c, r]
    genes = lines(expr_file)
    for tool in set(df_fdr_peaks['tool']):
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) & (df_fdr_peaks['modification'] == 'H3K4me3') &
                         (df_fdr_peaks['cell'] == c) & (df_fdr_peaks['replicate'] == r)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        peaks = lines(peaks_file)
        peaks_overlap, genes_overlap, _ = overlap_by_length(peaks_file, expr_file, 100)
        bench_df.loc[len(bench_df)] = (c, r, tool, '100bp', peaks, genes, peaks_overlap, genes_overlap)
        for overlap, overlap_param in [
            ('1bp', ''),
            ('50%', ' -f 0.5 '),
        ]:
            !bedtools intersect -a {peaks_file} -b {expr_file} -wa -u {overlap_param} > {tf}
            peaks_overlap = lines(tf)
            !bedtools intersect -b {peaks_file} -a {expr_file} -wa -u {overlap_param} > {tf}
            genes_overlap = lines(tf)
            bench_df.loc[len(bench_df)] = (c, r, tool, overlap, peaks, genes, peaks_overlap, genes_overlap)

bench_df

In [None]:
# precision: share of peaks overlapping expressed-gene TSS windows
bench_df['precision'] = [
    d(hits, total) for hits, total in zip(bench_df['peaks_overlap'], bench_df['peaks'])
]
# sensitivity: share of expressed-gene TSS windows overlapping peaks
bench_df['sensitivity'] = [
    d(hits, total) for hits, total in zip(bench_df['genes_overlap'], bench_df['genes'])
]
# f1: harmonic mean of both; the 1e-10 keeps the reciprocals finite at zero
bench_df['f1'] = [
    d(2, d(1, s + 1e-10) + d(1, p + 1e-10))
    for s, p in zip(bench_df['sensitivity'], bench_df['precision'])
]

In [None]:
# Report, then drop, the rows with precision below 0.1
print('Ignore outliers')
outliers = bench_df[bench_df['precision'] < 0.1]
print(len(outliers))
keep = bench_df['precision'] >= 0.1
bench_df = bench_df[keep]

In [None]:
import seaborn as sns

# Precision against sensitivity; colour encodes the tool, marker the overlap criterion
plt.figure(figsize=(6, 4))
g_results = sns.scatterplot(data=bench_df, x='precision', y='sensitivity', hue='name', style='overlap')
scatter_ax = g_results.axes
scatter_ax.set_xlabel('Peaks overlapping with genes (precision)')
scatter_ax.set_ylabel('Genes overlapping with peaks (sensitivity)')
scatter_ax.set_xlim([0, 1])
scatter_ax.set_ylim([0, 1])
# Put a legend to the right of the current axis
scatter_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Sensitivity per overlap criterion, grouped by tool
plt.figure(figsize=(4, 4))
g_results = sns.boxplot(x='overlap', y='sensitivity', hue='name', data=bench_df)
g_results.set_ylabel('Genes overlapping with peaks (sensitivity)')
plt.show()

In [None]:
# Precision per overlap criterion, grouped by tool
plt.figure(figsize=(4, 4))
g_results = sns.boxplot(x='overlap', y='precision', hue='name', data=bench_df)
g_results.set_ylabel('Peaks overlapping with genes (precision)')
plt.show()

In [None]:
# F1 per overlap criterion, grouped by tool
plt.figure(figsize=(4, 4))
sns.boxplot(x='overlap', y='f1', hue='name', data=bench_df)
plt.show()

## Overlap diff H3K4me3 vs RNA-seq

In [None]:
TOOLS = list(sorted(set(bench_df['name'])))

diff_bench_df = pd.DataFrame(
    columns=['cell', 'replicate', 'name', 'overlap', 'peaks', 'peaks_overlap', 'genes', 'genes_overlap'],
    dtype=object
)

tf = tempfile.mktemp()
tf2 = tempfile.mktemp()

for c, r in tqdm(product(GSE26320_CELLS, GSE26320_REPS)):
    if (c, r) not in EXPRESSION_TSS_BED:
        continue
    print(c, r)
    expr_file = EXPRESSION_TSS_BED[(c, r)]
    t = df_fdr_peaks[(df_fdr_peaks['tool'] == 'SPAN') & (df_fdr_peaks['modification'] == 'H3K4me3') &
                     (df_fdr_peaks['cell'] == c) & (df_fdr_peaks['replicate'] == r)]
    if len(t) == 0:
        continue
    span_file = t['file'].values[0]
    genes = lines(expr_file)
    # Processing single tools information
    for tool in TOOLS:
        if tool == 'SPAN':
            continue
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) & (df_fdr_peaks['modification'] == 'H3K4me3') &
                         (df_fdr_peaks['cell'] == c) & (df_fdr_peaks['replicate'] == r)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        for name, args in [
            (f'SPAN - {tool}', f' -a {span_file} -b {peaks_file} '),
            (f'{tool} - SPAN', f' -b {span_file} -a {peaks_file} ')]:
            !bedtools intersect {args} -wa -v > {tf}
            peaks = lines(tf)
            peaks_overlap, genes_overlap, _ = overlap_by_length(tf, expr_file, 100)
            diff_bench_df.loc[len(diff_bench_df)] = \
                (c, r, name, '100bp', peaks, peaks_overlap, genes, genes_overlap)
            for overlap, overlap_param in [
                ('1bp', ''),
                ('50%', '-f 0.5'),
            ]:
                !bedtools intersect -a {tf} -b {expr_file} -wa -u {overlap_param} > {tf2}
                peaks_overlap = lines(tf2)
                !bedtools intersect -b {tf} -a {expr_file} -wa -u {overlap_param} > {tf2}
                genes_overlap = lines(tf2)
                diff_bench_df.loc[len(diff_bench_df)] = \
                    (c, r, name, overlap, peaks, peaks_overlap, genes, genes_overlap)

display(diff_bench_df.head())

In [None]:
# precision: share of difference peaks overlapping expressed-gene TSS windows
diff_bench_df['precision'] = [
    d(hits, total) for hits, total in zip(diff_bench_df['peaks_overlap'], diff_bench_df['peaks'])
]
# sensitivity: share of expressed-gene TSS windows overlapping difference peaks
diff_bench_df['sensitivity'] = [
    d(hits, total) for hits, total in zip(diff_bench_df['genes_overlap'], diff_bench_df['genes'])
]
# f1: harmonic mean of both; the 1e-10 keeps the reciprocals finite at zero
diff_bench_df['f1'] = [
    d(2, d(1, s + 1e-10) + d(1, p + 1e-10))
    for s, p in zip(diff_bench_df['sensitivity'], diff_bench_df['precision'])
]

In [None]:
# Per-tool and difference-set results in one table with a common set of columns
shared_columns = ['cell', 'replicate', 'name', 'peaks', 'overlap', 'precision', 'sensitivity']
full_bench_df = pd.concat(
    [frame[shared_columns] for frame in (bench_df, diff_bench_df)]
).reset_index(drop=True)
full_bench_df

In [None]:
# Number of peaks per tool and per difference set ('1bp' rows only)
name_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
ax = plt.figure(figsize=(4, 4)).add_subplot(111)
ax.set_title('Peaks number')
one_bp = full_bench_df[full_bench_df['overlap'] == '1bp']
g_results = sns.barplot(data=one_bp, x='name', y='peaks', ax=ax,
                        capsize=.2, errwidth=2, order=name_order)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Peaks number')
plt.tight_layout()
plt.show()

In [None]:
# Precision per tool and per difference set, one bar per overlap criterion
name_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
ax = plt.figure(figsize=(6, 4)).add_subplot(111)
ax.set_title('Peaks overlapping with genes (precision)')
g_results = sns.barplot(data=full_bench_df, x='name', y='precision', hue='overlap',
                        ax=ax, capsize=.2, errwidth=2, order=name_order)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Sensitivity per tool and per difference set, one bar per overlap criterion
name_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
ax = plt.figure(figsize=(6, 4)).add_subplot(111)
ax.set_title('Genes overlapping with peaks (sensitivity)')
g_results = sns.barplot(data=full_bench_df, x='name', y='sensitivity', hue='overlap',
                        ax=ax, capsize=.2, errwidth=2, order=name_order)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
# Sensitivity per tool and per difference set ('1bp' rows only)
name_order = [
    'MACS2', 'MACS2 broad', 'SICER', 'SPAN',
    'SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER',
    'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN',
]
ax = plt.figure(figsize=(4, 4)).add_subplot(111)
ax.set_title('Genes overlapping with peaks')
one_bp = full_bench_df[full_bench_df['overlap'] == '1bp']
g_results = sns.barplot(data=one_bp, x='name', y='sensitivity',
                        ax=ax, capsize=.2, errwidth=2, order=name_order)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel('Fraction')
plt.tight_layout()
plt.show()

## Hg38 Recovered promoter fraction / Correct peak fraction

In [None]:
import tempfile
from itertools import product

tf = tempfile.mktemp()
tf2 = tempfile.mktemp()

benchmark_df = pd.DataFrame(
    columns=['cell', 'replicate', 'top', 'genes', 'peaks_file', 'peaks', 'pg', 'gp', 'tool'],
    dtype=object
)

for c, r in tqdm(product(GSE26320_CELLS, GSE26320_REPS)):
    print(c, r)
    if (c, r) not in EXPRESSION_TSS_BED:
        continue
    expr_file = EXPRESSION_TSS_BED[c, r]
    genes = lines(expr_file)
    for tool in set(df_fdr_peaks['tool']):
        t = df_fdr_peaks[(df_fdr_peaks['tool'] == tool) & (df_fdr_peaks['modification'] == 'H3K4me3') &
                         (df_fdr_peaks['cell'] == c) & (df_fdr_peaks['replicate'] == r)]
        if len(t) == 0:
            continue
        peaks_file = t['file'].values[0]
        peaks = lines(peaks_file)
        t = pd.read_csv(peaks_file, sep='\t', header=None)
        t.sort_values(by=[8] if len(t.columns) >= 9 else [4], ascending=False, inplace=True)
        for top in np.linspace(1000, 15000, 15):
            t.head(int(top)).sort_values(by=[0, 1]).to_csv(tf, sep='\t', index=False, header=None)
            peaks = lines(tf)
            ! bedtools intersect -a {tf} -b {expr_file} -wa -u > {tf2}
            peaks_overlap = lines(tf2)
            ! bedtools intersect -b {tf} -a {expr_file} -wa -u > {tf2}
            genes_overlap = lines(tf2)
            benchmark_df.loc[len(benchmark_df)] = \
                (c, r, top, genes, peaks_file, peaks, peaks_overlap, genes_overlap, tool)

benchmark_df

In [None]:
# Mean of the two overlap counts
benchmark_df['p'] = (benchmark_df['pg'] + benchmark_df['gp']) / 2
# precision: top peaks overlapping TSS windows / top peaks
benchmark_df['precision'] = [
    d(hits, total) for hits, total in zip(benchmark_df['pg'], benchmark_df['peaks'])
]
# sensitivity: TSS windows overlapping top peaks / TSS windows
benchmark_df['sensitivity'] = [
    d(hits, total) for hits, total in zip(benchmark_df['gp'], benchmark_df['genes'])
]
# f1: harmonic mean of both; the 1e-10 keeps the reciprocals finite at zero
benchmark_df['f1'] = [
    2 / (d(1, s + 1e-10) + d(1, p + 1e-10))
    for s, p in zip(benchmark_df['sensitivity'], benchmark_df['precision'])
]
benchmark_df

In [None]:
import plotly.graph_objects as go

def plot_top(benchmark_df):
    """Plot sensitivity against precision of the top-N peaks, one curve per sample and tool.

    For every (cell, replicate, tool) with rows in `benchmark_df`, a
    semi-transparent line in the tool's colour is drawn first, then white
    markers are drawn on top of all lines. Each tool appears once in the legend.

    :param benchmark_df: DataFrame with columns cell, replicate, tool, top,
        precision and sensitivity
    """
    # Local import: matplotlib is already a dependency of this notebook
    from matplotlib.colors import to_hex

    fig = go.Figure()

    # Non-empty (tool, rows) groups in a fixed order, shared by both passes
    groups = []
    for c, r, t in product(GSE26320_CELLS, GSE26320_REPS, df_fdr_peaks['tool'].unique()):
        dft = benchmark_df[(benchmark_df['cell'] == c) & (benchmark_df['replicate'] == r) &
                           (benchmark_df['tool'] == t)]
        if len(dft) > 0:
            groups.append((t, dft))

    tools_legend_shown = set()
    for t, dft in groups:
        fig.add_trace(go.Scatter(
            x=dft["precision"], y=dft["sensitivity"], mode='lines+markers', name=t,
            hovertext=dft['top'].astype(str) + ' ' + t,
            showlegend=t not in tools_legend_shown,
            # plotly needs a colour string; a matplotlib RGBA tuple would be
            # read as an array of per-point numeric colours
            marker_color=to_hex(TOOLS_PALETTE[t]),
            opacity=0.3,
        ))
        tools_legend_shown.add(t)

    for t, dft in groups:
        fig.add_trace(
            go.Scatter(
                mode='markers',
                x=dft["precision"], y=dft["sensitivity"],
                name=t,
                marker=dict(color='white', size=5, line=dict(width=1)),
                showlegend=False,
                hovertext=dft['top'].astype(str) + ' ' + t
            )
        )

    # fig.update_xaxes(range=[-0.1, 1.1], title='Peaks overlapping active genes (precision)')
    # fig.update_yaxes(range=[-0.1, 1.1], title='Active genes overlapping peaks (sensitivity)')
    # x carries precision and y carries sensitivity, so the titles must say so
    fig.update_xaxes(title='Correct peak fraction (precision)')
    fig.update_yaxes(title='Recovered promoter fraction (sensitivity)')

    fig.layout.template = 'plotly_white'
    fig.update_layout(
        autosize=False,
        width=1200,
        height=800,)
    fig.show()


In [None]:
# Curves of a single sample: GM12878, replicate 1
gm12878_rep1 = (benchmark_df['cell'] == 'GM12878') & (benchmark_df['replicate'] == 'rep1')
plot_top(benchmark_df[gm12878_rep1])

In [None]:
# Curves of all cells and replicates in one figure
plot_top(benchmark_df)

# hg19 Recovered promoter fraction / Correct peak fraction

In [None]:
# Load the hg19 H3K4me3 report (headerless tab-separated table)
report_path = os.path.expanduser('~/data/2022_GSE26320_GM12878_chipseq/k4me3_report.tsv')
report_columns = ['n', 'file', 'peaks', 'tss_peaks', 'cp', 'rp']
df = pd.read_csv(report_path, sep='\t', names=report_columns)
# cpf = cp / peaks, rpf = rp / tss_peaks
df['cpf'] = df['cp'].div(df['peaks'])
df['rpf'] = df['rp'].div(df['tss_peaks'])
df

In [None]:
def detect_tool(file):
    """Derive the peak caller name from a marker contained in the file name.

    Markers are checked in order: '.narrowPeak', '.broadPeak', '.peak';
    anything else is reported as 'SICER'.
    """
    markers = (
        ('.narrowPeak', 'MACS2'),
        ('.broadPeak', 'MACS2 broad'),
        ('.peak', 'SPAN'),
    )
    for marker, tool in markers:
        if marker in file:
            return tool
    return 'SICER'


def _tool_dir(tool):
    """Name of the directory associated with a tool."""
    if 'MACS2' in tool:
        return 'macs2'
    if tool == 'SICER':
        return 'sicer'
    return 'SPAN'


df['tool'] = [detect_tool(file) for file in df['file']]
df['dir'] = [_tool_dir(tool) for tool in df['tool']]

In [None]:
# import plotly.express as px
#
# fig = px.line(df, x="rpf", y="cpf", color="tool", hover_name="n")
# fig.update_xaxes(range=[0, 1], row=1, col=1, title='recovered promoter fraction')
# fig.update_yaxes(range=[0, 1], row=1, col=1, title='correct peak fraction')
# fig.show()

In [None]:
import plotly.graph_objects as go

# Correct peak fraction against recovered promoter fraction, one trace per tool
fig = go.Figure()
# sorted(): set order varies between kernel sessions, which would shuffle
# the trace order and therefore the colours and the legend
for t in sorted(set(df['tool'])):
    dft = df[df['tool'] == t]
    fig.add_trace(go.Scatter(
        x=dft["rpf"], y=dft["cpf"], mode='lines+markers', name=t, hovertext=dft['n']))
fig.update_xaxes(range=[0, 0.8], title='Recovered Promoter Fraction')
fig.update_yaxes(range=[0, 1], title='Correct Peak Fraction')
fig.layout.template = 'plotly_white'
fig.update_layout(
    autosize=False,
    width=1200,
    height=800,)
fig.show()

In [None]:
import plotly.express as px

# Correct peak fraction against recovered promoter fraction, one line per tool
fig = px.line(df, x="rpf", y="cpf", color="tool")
fig.update_xaxes(title_text='Recovered Promoter Fraction')
fig.update_yaxes(title_text='Correct Peak Fraction')
fig.update_layout(template='plotly_white', autosize=False, width=1200, height=800)
fig.show()
# plt.savefig(os.path.expanduser('~/data/2022_GSE26320_k4me3') + '/sens.pdf')

In [None]:
df['precision'] = df['cpf']
df['sensitivity'] = df['rpf']
# F1 is the harmonic mean of sensitivity and precision. It is set to 0 when
# either of them is 0, where the reciprocal would raise ZeroDivisionError.
df['f1'] = [0 if s == 0 or p == 0 else 2 / (1 / s + 1 / p)
            for s, p in zip(df['sensitivity'], df['precision'])]

In [None]:
import plotly.express as px

# Sensitivity as a function of the number of peaks, one line per tool
fig = px.line(df, x="peaks", y="sensitivity", color="tool")
fig.update_xaxes(title_text='Peaks')
fig.update_yaxes(title_text='Sensitivity (Recall)')
fig.update_layout(template='plotly_white', autosize=False, width=1200, height=800)
fig.show()
# plt.savefig(os.path.expanduser('~/data/2022_GSE26320_k4me3') + '/sens.pdf')

In [None]:
import plotly.express as px

# Precision as a function of the number of peaks, one line per tool
fig = px.line(df, x="peaks", y="precision", color="tool")
fig.update_xaxes(title_text='Peaks')
fig.update_yaxes(title_text='Precision')
fig.update_layout(template='plotly_white', autosize=False, width=1200, height=800)
fig.show()
# plt.savefig(os.path.expanduser('~/data/2022_GSE26320_k4me3') + '/sens.pdf')

In [None]:
import plotly.express as px

# F1 as a function of the number of peaks, one line per tool
fig = px.line(df, x="peaks", y="f1", color="tool")
fig.update_xaxes(title_text='Peaks')
fig.update_yaxes(title_text='F1')
fig.update_layout(template='plotly_white', autosize=False, width=1200, height=800)
fig.show()
# plt.savefig(os.path.expanduser('~/data/2022_GSE26320_k4me3') + '/sens.pdf')