# 2023 Immune dataset

Analyze peak-calling quality on tracks whose quality was artificially lowered, either by subsampling or by mixing in noise.

Data:
https://www.encodeproject.org/immune-cells/?type=Experiment&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.cell_slims=hematopoietic+cell&biosample_ontology.classification=primary+cell&control_type!=*&status!=replaced&status!=revoked&status!=archived&biosample_ontology.system_slims=immune+system&biosample_ontology.system_slims=circulatory+system&config=immune


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a BED-like file.

    The file is parsed as headerless, tab-separated text; column 1 is used as
    the interval start and column 2 as the interval end.

    Returns:
        A pandas Series with one length per row, or an empty NumPy array when
        the file contains no data.

    Raises:
        FileNotFoundError, pandas.errors.ParserError, ...: anything other than
        an empty file is reported instead of being silently treated as
        "no peaks".
    """
    try:
        # Only the start/end columns are needed to compute the lengths.
        tf = pd.read_csv(file, sep='\t', header=None, usecols=[1, 2])
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file
    return tf[2] - tf[1]


def lines(file):
    """Return the number of data rows in a headerless tab-separated file.

    Rows are counted as parsed by ``pandas.read_csv``. An empty file yields 0;
    any other problem (missing file, malformed content) is raised to the
    caller rather than being reported as zero rows.
    """
    try:
        return len(pd.read_csv(file, sep='\t', header=None))
    except pd.errors.EmptyDataError:
        return 0  # Empty file

def d(a, b):
    """Divide ``a`` by ``b``, returning 0 instead of failing when ``b`` is zero."""
    if b == 0:
        return 0
    return a / b

def last_col(file):
    """Return the last column of a headerless tab-separated file.

    The first row is read on its own to find out how many columns there are,
    then only the last column is loaded.

    Returns:
        A pandas Series holding the last column, or an empty NumPy array when
        the file contains no data.

    Raises:
        FileNotFoundError, pandas.errors.ParserError, ...: anything other than
        an empty file is reported instead of being silently treated as empty.
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
        return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file

def sorted_file(file):
    """Write a copy of ``file`` sorted by column 1, then numerically by column 2.

    Sorting is delegated to the external ``sort`` utility
    (``sort -k1,1 -k2,2n``). The result goes to a fresh temporary file which
    is NOT removed automatically; the caller owns it.

    Returns:
        Path of the temporary file holding the sorted content.

    Raises:
        subprocess.CalledProcessError: if ``sort`` exits with an error, for
            example because ``file`` does not exist.
    """
    # Local imports keep the helper usable regardless of notebook cell order.
    import subprocess
    import tempfile

    # NamedTemporaryFile creates the file atomically, unlike the deprecated
    # tempfile.mktemp(), which only returns a name another process could claim.
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as out:
        # Passing an argument list avoids shell quoting problems with paths.
        subprocess.run(['sort', '-k1,1', '-k2,2n', str(file)], stdout=out, check=True)
    return out.name

# Load peaks

In [None]:
import re

# Directory with the peak calls made on the original tracks.
PATH = os.path.expanduser('~/data/2023_Immune')

# Cell names; matched as substrings of the peak file names.
IMMUNE_CELLS = ['BCell', 'TCell', 'Monocyte']

# Modification under analysis; also matched as a substring of the file names.
M = 'H3K27ac'
MODIFICATIONS = [M]

# Selects the directory with the lowered-quality tracks.
# Alternative value: 'mln_control'
METHOD = 'noise'
PATH_BAD_QUALITY = os.path.expanduser(f'~/data/2023_Immune_{METHOD}')

def load_peaks_fdr(path, suffix, fdrs):
    """Collect peak statistics for the peak files found in ``path``.

    A file is considered when its name contains ``suffix`` as well as one of
    ``fdrs``, one of ``IMMUNE_CELLS`` and one of ``MODIFICATIONS`` (all plain
    substring checks; the first matching candidate wins).

    Returns:
        DataFrame with columns ``file`` (bare file name), ``modification``,
        ``cell``, ``fdr``, ``peaks`` (row count) and ``avlength`` (mean
        interval length, 0 for files without rows).
    """
    def first_match(candidates, name):
        # First candidate occurring in the file name, or None.
        return next((c for c in candidates if c in name), None)

    columns = ['file', 'modification', 'cell', 'fdr', 'peaks', 'avlength']
    df_fdr = pd.DataFrame(columns=columns, dtype=object)
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        fdr = first_match(fdrs, f)
        cell = first_match(IMMUNE_CELLS, f)
        mod = first_match(MODIFICATIONS, f)
        if not (fdr and cell and mod):
            continue
        peaks_path = os.path.join(path, f)
        ps = lines(peaks_path)
        ls = bedl(peaks_path)
        avls = sum(ls) / ps if ps != 0 else 0
        df_fdr.loc[len(df_fdr)] = (f, mod, cell, fdr, ps, avls)
    return df_fdr

In [None]:
def _load_fdr_tool(tool, subdir, suffix, fdr):
    """Load peak statistics of one tool from ``PATH/subdir`` and tag them with ``tool``."""
    df = load_peaks_fdr(os.path.join(PATH, subdir), suffix, [fdr])
    # Turn the bare file names into paths usable by later cells.
    df['file'] = [f'{PATH}/{subdir}/{f}' for f in df['file']]
    df['tool'] = tool
    print(tool, len(df))
    return df


df_fdr_macs2 = _load_fdr_tool('MACS2', 'macs2', '.narrowPeak', '0.05')
df_fdr_macs2broad = _load_fdr_tool('MACS2 broad', 'macs2', '.broadPeak', '0.1')
df_fdr_sicer = _load_fdr_tool('SICER', 'sicer', 'summary-FDR', '0.01')
df_fdr_span = _load_fdr_tool('SPAN', 'span', '.peak', '0.05')

# ignore_index gives the combined frame unique row labels; without it every
# per-tool frame contributes its own 0..n-1 labels.
df_fdr_peaks = pd.concat([df_fdr_macs2, df_fdr_macs2broad, df_fdr_sicer, df_fdr_span],
                         ignore_index=True)
# sample(5) raises ValueError when fewer than 5 rows were loaded.
df_fdr_peaks.sample(min(5, len(df_fdr_peaks)))

In [None]:
# Fixed tool order, so that every tool keeps the same colour in all figures.
TOOLS = ['MACS2', 'MACS2 broad', 'SICER', 'SPAN', 'Genes']
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
palette = plt.get_cmap('tab10')
# Map each tool name to the i-th colour of the colormap.
TOOLS_PALETTE = {tool: palette(i) for i, tool in enumerate(TOOLS)}

In [None]:
# Bar plot: number of peaks per tool.
ax = plt.figure(figsize=(4, 4)).add_subplot()
g_results = sns.barplot(
    data=df_fdr_peaks, x='tool', y='peaks',
    palette=TOOLS_PALETTE, edgecolor="black",
    capsize=.2, errwidth=2,
    ax=ax,
)
# Tool names are long; show them vertically.
ax.tick_params(axis='x', rotation=90)
ax.title.set_text(f'{M} Peaks number')
plt.show()

# Bad quality tracks

In [None]:
# Track depths to analyse; a file is assigned to a depth via the token
# "<depth>mln" in its name (presumably millions of reads -- confirm).
MLNS = [10, 5, 2, 1]

def load_peaks_mln(path, suffix, mlns):
    """Collect peak statistics for the lowered-quality peak files in ``path``.

    A file is considered when its name contains ``suffix``, a ``<mln>mln``
    token for one of ``mlns``, one of ``IMMUNE_CELLS`` and one of
    ``MODIFICATIONS``. The ``<mln>mln`` token must not be preceded by another
    digit, so that e.g. ``15mln`` is never mistaken for ``5mln``.

    Returns:
        DataFrame with columns ``file`` (bare file name), ``modification``,
        ``cell``, ``mln``, ``peaks`` (row count) and ``avlength`` (mean
        interval length, 0 for files without rows).
    """
    import re

    df_mln = pd.DataFrame(columns=['file', 'modification', 'cell', 'mln', 'peaks', 'avlength'],
                          dtype=object)
    # Compiled once; the negative look-behind enforces a digit boundary.
    patterns = [(mln, re.compile(rf'(?<!\d){re.escape(str(mln))}mln')) for mln in mlns]
    for f in tqdm(os.listdir(path)):
        if suffix not in f:
            continue
        mln = next((value for value, pattern in patterns if pattern.search(f)), None)
        cell = next((cc for cc in IMMUNE_CELLS if cc in f), None)
        mod = next((mm for mm in MODIFICATIONS if mm in f), None)
        if mln is None or not cell or not mod:
            continue
        peaks_path = os.path.join(path, f)
        ps, ls = lines(peaks_path), bedl(peaks_path)
        avls = 0 if ps == 0 else sum(ls) / ps
        df_mln.loc[len(df_mln)] = (f, mod, cell, mln, ps, avls)
    return df_mln

In [None]:
def _load_mln_tool(tool, subdir, suffix):
    """Load lowered-quality peak statistics of one tool and tag them with ``tool``."""
    df = load_peaks_mln(os.path.join(PATH_BAD_QUALITY, subdir), suffix, MLNS)
    # Turn the bare file names into paths below PATH_BAD_QUALITY.
    df['file'] = [f'{PATH_BAD_QUALITY}/{subdir}/{f}' for f in df['file']]
    df['tool'] = tool
    print(tool, len(df))
    return df


df_mln_macs2 = _load_mln_tool('MACS2', 'macs2', '.narrowPeak')
df_mln_macs2broad = _load_mln_tool('MACS2 broad', 'macs2', '.broadPeak')
df_mln_sicer = _load_mln_tool('SICER', 'sicer', 'summary-FDR')
df_mln_span = _load_mln_tool('SPAN', 'span', '.peak')

In [None]:
# Add the original-quality peak sets to the table under the placeholder
# value mln == 20, so they can be plotted and selected like the other tracks.
t = df_fdr_peaks.copy().rename(columns={'fdr': 'mln'})
t['mln'] = 20
df_mln_peaks = pd.concat([df_mln_macs2, df_mln_macs2broad, df_mln_sicer, df_mln_span, t]).reset_index(drop=True)
df_mln_peaks.sort_values(by=['tool', 'mln'], inplace=True)
# sample(5) raises ValueError when fewer than 5 rows are available.
df_mln_peaks.sample(min(5, len(df_mln_peaks)))

In [None]:
# Bar plot: number of peaks per mln value, one bar per tool.
ax = plt.figure(figsize=(4, 4)).add_subplot()
g_results = sns.barplot(
    data=df_mln_peaks, x='mln', y='peaks', hue='tool',
    palette=TOOLS_PALETTE, edgecolor="black",
    capsize=.2, errwidth=2,
    ax=ax,
)
ax.title.set_text(f'{M} {METHOD} Peaks number')
ax.set_ylabel('Peaks number')
# Optional tweaks:
# ax.xaxis.set_tick_params(rotation=90)
# ax.set_ylim([0, 5e4]) # Limited for visual aesthetics
plt.tight_layout()
plt.show()

In [None]:
# Bar plot: average peak length per mln value, one bar per tool.
ax = plt.figure(figsize=(4, 4)).add_subplot()
g_results = sns.barplot(
    data=df_mln_peaks, x='mln', y='avlength', hue='tool',
    palette=TOOLS_PALETTE, edgecolor="black",
    capsize=.2, errwidth=2,
    ax=ax,
)
ax.title.set_text(f'{M} {METHOD} Average length')
ax.set_ylabel('Average length')
# Optional tweaks:
# ax.xaxis.set_tick_params(rotation=90)
# ax.set_ylim([0, 5e4]) # Limited for visual aesthetics
plt.tight_layout()
plt.show()

In [None]:
import tempfile
from itertools import product

TOOLS = list(sorted(df_fdr_peaks['tool'].unique()))

mln_bench_df = pd.DataFrame(
    columns=['cell', 'name', 'full_peaks', 'full_peaks_len',
             'mln', 'mln_peaks', 'mln_peaks_len',
             'full_overlap', 'mln_overlap',
             'overlap_len'],
    dtype=object
)

tf = tempfile.mktemp()

for c, tool in tqdm(product(IMMUNE_CELLS, TOOLS)):
    print(c, tool)
    t = df_mln_peaks[(df_mln_peaks['tool'] == tool) &
                     (df_mln_peaks['modification'] == M) &
                     (df_mln_peaks['cell'] == c) &
                     (df_mln_peaks['mln'] == 20)]
    if len(t) == 0:
        continue
    full_file = sorted_file(t['file'].values[0])
    full_peaks = lines(full_file)
    full_peaks_len = int(bedl(full_file).sum())
    for mln in MLNS:
        t = df_mln_peaks[(df_mln_peaks['tool'] == tool) &
                         (df_mln_peaks['modification'] == M) &
                         (df_mln_peaks['cell'] == c) &
                         (df_mln_peaks['mln'] == mln)]
        if len(t) == 0:
            continue
        mln_file = sorted_file(t['file'].values[0])
        mln_peaks = lines(mln_file)
        mln_peaks_len = int(bedl(mln_file).sum())
        !bedtools intersect -a {full_file} -b {mln_file} -wa -u > {tf}
        full_peaks_overlap = lines(tf)
        !bedtools intersect -b {full_file} -a {mln_file} -wa -u > {tf}
        mln_peaks_overlap = lines(tf)
        !bedtools intersect -a {full_file} -b {mln_file} -wo > {tf}
        overlap_len = int(last_col(tf).sum())
        mln_bench_df.loc[len(mln_bench_df)] = \
            (c, tool,
             full_peaks, full_peaks_len,
             mln, mln_peaks, mln_peaks_len,
             full_peaks_overlap, mln_peaks_overlap,
             overlap_len)

display(mln_bench_df.sample())

In [None]:
def _ratio(numerator, denominator):
    """Row-wise safe ratio of two mln_bench_df columns."""
    return [d(a, b) for a, b in zip(mln_bench_df[numerator], mln_bench_df[denominator])]


def _f1(sensitivity, precision):
    """Row-wise harmonic mean of two mln_bench_df columns (1e-10 added to each term)."""
    return [d(2, d(1, s + 1e-10) + d(1, p + 1e-10))
            for s, p in zip(mln_bench_df[sensitivity], mln_bench_df[precision])]


# Metrics based on the number of peaks.
mln_bench_df['precision'] = _ratio('mln_overlap', 'mln_peaks')
mln_bench_df['sensitivity'] = _ratio('full_overlap', 'full_peaks')
mln_bench_df['f1'] = _f1('sensitivity', 'precision')

# The same metrics based on summed lengths.
mln_bench_df['precision_len'] = _ratio('overlap_len', 'mln_peaks_len')
mln_bench_df['sensitivity_len'] = _ratio('overlap_len', 'full_peaks_len')
mln_bench_df['f1_len'] = _f1('sensitivity_len', 'precision_len')

# Jaccard index: overlap / (full + mln - overlap), on lengths.
mln_bench_df['jaccard'] = [
    d(o, p + g - o)
    for p, g, o in zip(mln_bench_df['full_peaks_len'],
                       mln_bench_df['mln_peaks_len'],
                       mln_bench_df['overlap_len'])
]

In [None]:
import seaborn as sns

# Precision vs sensitivity scatter plots: by peak count (left), by length (right).
plt.figure(figsize=(8, 3))
axs = [plt.subplot(1, 2, k) for k in (1, 2)]

for ax, (suffix, unit) in zip(axs, [('', 'peaks'), ('_len', 'length')]):
    g_results = sns.scatterplot(data=mln_bench_df,
                                x=f'precision{suffix}', y=f'sensitivity{suffix}',
                                hue='name', style='mln',
                                palette=TOOLS_PALETTE, ax=ax)
    ax.set_xlabel(f'Mln vs full ({unit})')
    ax.set_ylabel(f'Full vs mln ({unit})')
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])
    if ax is axs[-1]:
        # Put a legend to the right of the last axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    else:
        ax.legend().set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
def _metric_frame(tn, name, variable, benchmark, column):
    """Long-format frame holding one metric column of one tool."""
    n = len(tn)
    return pd.DataFrame(dict(name=[name] * n, type=[variable] * n,
                             benchmark=[benchmark] * n,
                             mln=tn['mln'],
                             value=tn[column]))


# Reshape mln_bench_df to long format: one row per (tool, metric, benchmark, mln).
frames = []
for name in mln_bench_df['name'].unique():
    tn = mln_bench_df[mln_bench_df['name'] == name]
    for variable in ['sensitivity', 'precision', 'f1']:
        for benchmark, column in (('peak', variable), ('length', f'{variable}_len')):
            frames.append(_metric_frame(tn, name, variable, benchmark, column))

t = pd.concat(frames).reset_index(drop=True)
del frames

In [None]:
# Grid of box plots: rows are benchmarks (peak, length), columns are mln values.
n_cols = len(MLNS)
plt.figure(figsize=(12, 6))
axs = [plt.subplot(2, n_cols, k) for k in range(1, 2 * n_cols + 1)]
for i, (benchmark, mln) in enumerate(product(['peak', 'length'], MLNS)):
    panel = t[(t['benchmark'] == benchmark) & (t['mln'] == mln)]
    g_results = sns.boxplot(data=panel, x='type', y='value', hue='name',
                            palette=TOOLS_PALETTE, ax=axs[i])
    ax = g_results.axes
    ax.set_title(f'{mln}mln vs full ({benchmark})')
    if i != n_cols - 1:
        ax.legend().set_visible(False)
    else:
        # Only the last panel of the first row keeps a legend, placed to its right
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylim([-0.1, 1.1])

plt.tight_layout()
plt.show()