# Immgen ATAC-Seq


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import pandas as pd
import seaborn as sns

sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm.auto import tqdm
import re
import tempfile

In [None]:
# Root directory with all inputs and outputs of this notebook ('~' is expanded to the home directory).
PATH = os.path.expanduser('~/data/2024_Immgen')
# Directory where the figures below are saved; -p makes the command a no-op if it already exists.
! mkdir -p {PATH}/pics

def file_to_name(file):
    """Derive the sample name from a file path.

    Takes the base name of ``file`` and strips everything up to and including
    the ``SRR<digits>_`` prefix as well as everything from ``_ATAC_seq`` on.
    """
    base = os.path.basename(file)
    noise = '(.*SRR[0-9]+_)|(_ATAC_seq.*)'
    return re.sub(noise, '', base)


def file_to_srr(file):
    """Return the base name of ``file`` cut at its first underscore.

    Presumably this is the SRR run accession the file name starts with.
    """
    base = os.path.basename(file)
    return re.sub('_.*', '', base)

In [None]:
def bedl(file):
    """Return the interval lengths (end - start) of a BED-like file.

    The file is read as header-less, tab-separated text; column 1 is used as
    start and column 2 as end.

    Returns:
        A pandas Series of lengths, or an empty numpy array when the file
        cannot be read or parsed (typically because it is empty).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[2] - tf[1]
    except Exception:
        # Best-effort fallback for empty/unparsable files. ``Exception`` (not a
        # bare ``except``) so that interrupting the kernel is not swallowed.
        return np.zeros(0)


def lines(file):
    """Return the number of records in a header-less, tab-separated file.

    The count is the number of rows pandas parses from the file.

    Returns:
        The row count, or 0 when the file cannot be read or parsed
        (typically because it is empty).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return len(tf)
    except Exception:
        # Best-effort fallback for empty/unparsable files. ``Exception`` (not a
        # bare ``except``) so that interrupting the kernel is not swallowed.
        return 0


def d(a, b):
    """Safe division: return ``a / b``, or 0 when ``b`` equals zero."""
    if b == 0:
        return 0
    return a / b


def sorted_file(file):
    """Write a copy of ``file`` sorted by column 1, then numerically by column 2.

    Runs the system ``sort -k1,1 -k2,2n`` (the ordering used for the bedtools
    calls in this notebook) and stores the result in a new temporary file.
    The temporary file is not removed automatically.

    Args:
        file: Path of the text file to sort.

    Returns:
        Path of the newly created sorted file.

    Raises:
        subprocess.CalledProcessError: If ``sort`` fails (e.g. missing input).
    """
    import subprocess

    # mkstemp creates the file atomically; tempfile.mktemp is deprecated because
    # the name it returns may be taken by another process before it is used.
    fd, ts = tempfile.mkstemp()
    with os.fdopen(fd, 'wb') as out:
        # Argument list instead of a shell string: paths with spaces are safe.
        subprocess.run(['sort', '-k1,1', '-k2,2n', file], stdout=out, check=True)
    return ts


def last_col(file):
    """Return the last column of a header-less, tab-separated file.

    The number of columns is taken from the first row; only the last column
    is then loaded.

    Returns:
        A pandas Series with the last column, or an empty numpy array when the
        file cannot be read or parsed (typically because it is empty).
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
        return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]
    except Exception:
        # Best-effort fallback for empty/unparsable files. ``Exception`` (not a
        # bare ``except``) so that interrupting the kernel is not swallowed.
        return np.zeros(0)

# Load peaks

In [None]:
# Peak callers compared in this notebook. Each one gets a fixed colour from
# matplotlib's 'tab10' colormap (by position in TOOLS) so figures stay consistent.
TOOLS = ['MACS2', 'MACS2 broad', 'SPAN', 'SICER']
palette = plt.get_cmap('tab10')
TOOLS_PALETTE = dict(zip(TOOLS, map(palette, range(len(TOOLS)))))

In [None]:
def atac_load_peaks(path, suffix):
    """List the peak files in ``path`` ending with ``suffix`` with their sizes.

    Returns a DataFrame with one row per matching file: ``file`` (full path)
    and ``peaks`` (record count as reported by ``lines``).
    """
    df = pd.DataFrame(columns=['file', 'peaks'], dtype=object)
    for entry in tqdm(os.listdir(path)):
        if entry.endswith(suffix):
            full_path = os.path.join(path, entry)
            df.loc[len(df)] = (full_path, lines(full_path))
    return df


def atac_load_peaks_path(path):
    """Collect the peak files of all tools found under ``path``.

    Each tool is looked up in its own sub-directory by file suffix; the number
    of files found per tool is printed.

    Returns:
        A DataFrame with columns ``file``, ``peaks`` and ``tool``, rows ordered
        MACS2, MACS2 broad, SICER, SPAN, with a fresh 0..n-1 index.
    """
    # (tool label, sub-directory, file suffix)
    sources = [
        ('MACS2', 'macs2', '.narrowPeak'),
        ('MACS2 broad', 'macs2', '.broadPeak'),
        ('SICER', 'sicer', '.scoreisland'),
        ('SPAN', 'span', '.peak'),
    ]
    frames = []
    for tool, subdir, suffix in sources:
        df_tool = atac_load_peaks(os.path.join(path, subdir), suffix)
        df_tool['tool'] = tool
        # Print the actual tool label (broad peaks used to be reported as 'MACS2').
        print(tool, len(df_tool))
        frames.append(df_tool)
    return pd.concat(frames).reset_index(drop=True)

In [None]:
# Load all peak files and annotate each with the sample name and run id
# parsed from its file name.
dfa = atac_load_peaks_path(PATH)
dfa['name'] = dfa['file'].map(file_to_name)
dfa['srr'] = dfa['file'].map(file_to_srr)
dfa.sort_values(by='name', inplace=True)
print(sorted(dfa['name'].unique()))
dfa.sample(3)

In [None]:
# Mean number of peaks per tool over all files (error bars: standard deviation).
plt.figure(figsize=(3, 4))
ax = plt.axes()
sns.barplot(data=dfa, x='tool', y='peaks',
            order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
            # seaborn >= 0.13 wants `hue` whenever `palette` is given
            hue='tool', legend=False,
            palette=TOOLS_PALETTE,
            errorbar='sd', capsize=.1,
            err_kws={'linewidth': 2},
            ax=ax)
plt.xticks(rotation=90)
plt.title('Peaks number')
plt.tight_layout()
# Own file name: the per-cell-type figure below is saved as peaks.pdf and
# would otherwise overwrite this one.
plt.savefig(f'{PATH}/pics/peaks_summary.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Number of peaks per sample name, one bar per tool (error bars: standard deviation).
plt.figure(figsize=(25, 6))
bar_style = dict(errorbar='sd', capsize=.05, err_kws={'linewidth': 2})
g_result = sns.barplot(
    data=dfa, x='name', y='peaks', hue='tool',
    hue_order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    palette=TOOLS_PALETTE,
    **bar_style,
)
# Legend outside the plot area, to the right.
g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Peaks number')
plt.xticks(rotation=90)
plt.tight_layout()
plt.tight_layout()
plt.savefig(f'{PATH}/pics/peaks.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Number of MACS2 broad peak files available per sample name.
macs2_broad_rows = dfa.loc[dfa['tool'] == 'MACS2 broad']
macs2_broad_rows.groupby(['name'])['file'].count().astype(int)

In [None]:
# Collect a sample of peak lengths for every peak file.
ts = []
for srr, name, tool, file in tqdm(zip(dfa['srr'], dfa['name'], dfa['tool'], dfa['file'])):
    by_length = pd.DataFrame(dict(length=bedl(file))).sort_values(by=['length'])
    # Some tracks may have open chromatin clusters, which significantly affect
    # the average, so keep only the shortest 80% of the peaks.
    shortest = by_length.iloc[:int(len(by_length) * 0.8)]
    # At most 10,000 randomly chosen lengths per file.
    picked = shortest.sample(min(len(shortest), 10_000))
    ts.append(picked.assign(srr=srr, name=name, tool=tool))
df_lens = pd.concat(ts).reset_index(drop=True)
del ts
df_lens.sample(10)

In [None]:
# Peak length distribution per sample name and tool (outliers hidden).
plt.figure(figsize=(25, 6))
g_result = sns.boxplot(
    data=df_lens, x='name', y='length', hue='tool',
    hue_order=['MACS2', 'MACS2 broad', 'SPAN', 'SICER'],
    palette=TOOLS_PALETTE,
    showfliers=False,
)
plt.title('Peaks length')
plt.xticks(rotation=90)
# Legend outside the plot area, to the right.
g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/peaks_length.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Same length box plot with the SICER rows left out.
plt.figure(figsize=(25, 6))
without_sicer = df_lens[df_lens['tool'] != 'SICER']
g_result = sns.boxplot(
    data=without_sicer, x='name', y='length', hue='tool',
    hue_order=['MACS2', 'MACS2 broad', 'SPAN'],
    palette=TOOLS_PALETTE,
    showfliers=False,
)
plt.title('Peaks length')
plt.xticks(rotation=90)
# Legend outside the plot area, to the right.
g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/peaks_length.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Density histogram (with KDE) of peak lengths per tool on a log-scaled x axis.
plt.figure(figsize=(6, 4))
ax = plt.axes()
hist_options = dict(
    stat='density', common_bins=False, common_norm=False,
    bins=50, kde=True, log_scale=True, alpha=0.2,
)
g_results = sns.histplot(
    data=df_lens, x='length', hue='tool', ax=ax,
    hue_order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    palette=TOOLS_PALETTE,
    **hist_options,
)
g_results.set(xscale='log')
# Fixed view window: lengths from 50 to 8000, density from 0 to 5.
g_results.set_ylim(0, 5)
g_results.set_xlim(50, 8e3)
ax.title.set_text('ATAC-seq Peaks length')
plt.tight_layout()
plt.savefig(f'{PATH}/pics/peaks_length2.pdf', bbox_inches='tight', dpi=300)
plt.show()

# Overlaps

In [None]:
! mkdir {PATH}/overlaps
import pyranges as pr


def compute_overlaps(df):
    """Compute pairwise overlaps between peak files sharing sample name and tool.

    For every (name, tool) group with at least two files, each unordered pair
    of files yields one row.

    Args:
        df: DataFrame with at least the columns ``name``, ``tool`` and ``file``.

    Returns:
        DataFrame with columns ``name``, ``tool``, ``file1``, ``file2``,
        ``peaks1``, ``peaks2`` (record counts), ``overlap12``, ``overlap21``
        (sizes of ``ranges1.overlap(ranges2)`` and the reverse),
        ``peaks1_len``, ``peaks2_len`` (summed interval lengths) and
        ``overlap_len`` (summed length of the intersection). Pairs with an
        empty file get zeros for all overlap values.
    """
    # Imported locally because the notebook-level `from itertools import product`
    # only happens in a later cell; without it this function raises NameError
    # when the notebook is executed top to bottom.
    from itertools import product

    dfoverlap = pd.DataFrame(columns=['name', 'tool', 'file1', 'file2',
                                      'peaks1', 'peaks2', 'overlap12', 'overlap21',
                                      'peaks1_len', 'peaks2_len', 'overlap_len'], dtype=object)
    for (name, tool), dft in tqdm(list(df.groupby(['name', 'tool']))):
        print('Processing', name, tool, len(dft))
        if len(dft) < 2:
            continue  # nothing to pair up
        files = list(dft['file'])
        ranges = [None] * len(files)
        peaks = [-1] * len(files)
        peaks_lens = [-1] * len(files)
        # Load every file once; empty files keep ranges[i] = None.
        for i, file in enumerate(files):
            peaks[i] = lines(file)
            if peaks[i] == 0:
                peaks_lens[i] = 0
                continue
            ranges[i] = pr.read_bed(sorted_file(file))
            peaks_lens[i] = ranges[i].lengths().sum()
        for i, j in product(range(len(ranges)), range(len(ranges))):
            if i >= j:
                continue  # each unordered pair once, no self-comparison
            file1, file2 = files[i], files[j]
            ranges1, ranges2 = ranges[i], ranges[j]
            peaks1, peaks2 = peaks[i], peaks[j]
            peaks1_len, peaks2_len = peaks_lens[i], peaks_lens[j]
            if peaks1 <= 0 or peaks2 <= 0:
                # At least one side is empty: no ranges object to intersect.
                dfoverlap.loc[len(dfoverlap)] = (name, tool, file1, file2,
                                                 peaks1, peaks2, 0, 0,
                                                 peaks1_len, peaks2_len, 0)
                continue
            overlap1 = len(ranges1.overlap(ranges2))
            overlap2 = len(ranges2.overlap(ranges1))
            overlap_len = ranges1.intersect(ranges2).lengths().sum()
            dfoverlap.loc[len(dfoverlap)] = (name, tool, file1, file2,
                                             peaks1, peaks2, overlap1, overlap2,
                                             peaks1_len, peaks2_len, overlap_len)
    return dfoverlap

In [None]:
# Pairwise overlaps between files that share the same sample name and tool.
df_overlap = compute_overlaps(dfa)

In [None]:
# Jaccard index on summed lengths: intersection / (len1 + len2 - intersection),
# defined as 0 when the denominator is 0 (see d()).
length_columns = df_overlap[['peaks1_len', 'peaks2_len', 'overlap_len']]
df_overlap['jaccard'] = [
    d(common, first + second - common)
    for first, second, common in length_columns.itertuples(index=False, name=None)
]
df_overlap.sort_values(by='name', inplace=True)

In [None]:
print('Jaccard')

# Mean Jaccard index per tool over all file pairs (error bars: standard deviation).
plt.figure(figsize=(3, 4))
g_result = sns.barplot(data=df_overlap,
                       x='tool', y='jaccard',
                       order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
                       hue='tool', legend=False,
                       palette=TOOLS_PALETTE,
                       errorbar='sd', capsize=.05,
                       err_kws={'linewidth': 2}
                       )
plt.xticks(rotation=90)
plt.title('Jaccard replicates')
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# Own file name: the per-cell-type figure below is saved as jaccard.pdf and
# would otherwise overwrite this one.
plt.savefig(f'{PATH}/pics/jaccard_summary.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
print('Jaccard')

# Jaccard index per sample name, one bar per tool (error bars: standard deviation).
plt.figure(figsize=(25, 6))
ax = plt.axes()
bar_style = dict(errorbar='sd', capsize=.05, err_kws={'linewidth': 2})
sns.barplot(
    data=df_overlap,
    x='name', y='jaccard', hue='tool',
    hue_order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
    palette=TOOLS_PALETTE,
    ax=ax,
    **bar_style,
)
plt.title('Jaccard for cell type')
plt.xticks(rotation=90)
# Legend outside the plot area, to the right.
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.savefig(f'{PATH}/pics/jaccard.pdf', bbox_inches='tight', dpi=300)
plt.show()

# Difference

In [None]:
# For every run, count the peaks that one tool reports and the other does not:
# 'SPAN - <tool>' and '<tool> - SPAN' for each non-SPAN tool.
diff_bench_df = pd.DataFrame(
    columns=['cell', 'srr', 'name', 'peaks'],
    dtype=object
)

# Scratch file receiving each bedtools result; reused for every comparison.
tf = tempfile.mktemp()

for srr in tqdm(dfa['srr'].unique()):
    print(srr)
    t = dfa[(dfa['tool'] == 'SPAN') & (dfa['srr'] == srr)]
    if len(t) == 0:
        continue  # no SPAN peaks for this run, nothing to compare against
    span_file = sorted_file(t['file'].values[0])
    cell_name = t['name'].values[0]
    # Processing single tools information
    for tool in dfa['tool'].unique():
        if tool == 'SPAN':
            continue
        t = dfa[(dfa['tool'] == tool) & (dfa['srr'] == srr)]
        if len(t) == 0:
            continue
        peaks_file = sorted_file(t['file'].values[0])
        # Both directions: the file passed as -a is the one whose peaks are kept.
        for name, args in [
            (f'SPAN - {tool}', f' -a {span_file} -b {peaks_file} '),
            (f'{tool} - SPAN', f' -b {span_file} -a {peaks_file} ')]:
            !echo '' > {tf}
            # -v: keep only -a entries without any overlap in -b.
            !bedtools intersect {args} -wa -v > {tf}
            peaks = lines(tf)
            diff_bench_df.loc[len(diff_bench_df)] = (cell_name, srr, name, peaks)
diff_bench_df

In [None]:
print('Diff')

# Mean number of tool-specific peaks for each SPAN/MACS2 comparison over all runs.
plt.figure(figsize=(3, 4))
g_result = sns.barplot(data=diff_bench_df,
                       x='name', y='peaks',
                       # order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER', 'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN'],
                       order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'MACS2 - SPAN', 'MACS2 broad - SPAN'],
                       capsize=.1, err_kws={'linewidth': 2},
                       )
plt.xticks(rotation=90)
plt.title('Difference')
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# Own file name: the per-cell-type figure below is saved as difference.pdf and
# would otherwise overwrite this one.
plt.savefig(f'{PATH}/pics/difference_summary.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
print('Diff')

# Tool-specific peaks per cell type, one bar per comparison (error bars: standard deviation).
plt.figure(figsize=(25, 6))
# Axes of THIS figure; without it `ax` would still point at the axes of an
# earlier cell and move_legend below would act on the wrong plot.
ax = plt.axes()
sns.barplot(data=diff_bench_df,
            x='cell', y='peaks', hue='name',
            hue_order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER', 'MACS2 - SPAN', 'MACS2 broad - SPAN',
                       'SICER - SPAN'],
            errorbar='sd', capsize=.05, err_kws={'linewidth': 2},
            ax=ax)
plt.xticks(rotation=90)
plt.title('Difference')
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.savefig(f'{PATH}/pics/difference.pdf', bbox_inches='tight', dpi=300)
plt.show()

# IDR between replicates

In [None]:
    from tempfile import mktemp


def compute_idrs(df):
    dfidr = pd.DataFrame(columns=['name', 'tool', 'rep1', 'rep2',
                                  'idr_peaks', 'idr_loaded_peaks', 'idr_percentage'], dtype=object)
    tf = mktemp()
    for (name, tool), dft in tqdm(list(df.groupby(['name', 'tool']))):
        print('Processing', name, tool, len(dft))
        files = list(dft['file'])
        for i1, i2 in product(range(len(files)), range(len(files))):
            if i1 >= i2:
                continue
            f1, f2 = sorted_file(files[i1]), sorted_file(files[i2])
            # Process SICER peaks, add missing 8th column, as -log10 qvalue
            if tool == 'SICER':
                sf1 = pd.read_csv(f1, sep='\t', header=None)
                sf1[8] = -np.log10(sf1[7])
                sf1.to_csv(f1, sep='\t', index=None, header=False)
                sf2 = pd.read_csv(f2, sep='\t', header=None)
                sf2[8] = -np.log10(sf2[7])
                sf2.to_csv(f2, sep='\t', index=None, header=False)
            ! idr --input-file-type bed --rank 8 --samples {f1} {f2} 2>&1 | tee {tf}
            with open(tf) as f:
                idr_out = f.read()
            ip, ilp, iperc = \
            re.findall('Number of peaks passing IDR cutoff of 0.05 - (\\d+)/(\\d+) \\(([\\d\\.]+)', idr_out)[0]
            ip, ilp, iperc = int(ip), int(ilp), float(iperc)
            dfidr.loc[len(dfidr)] = (name, tool, i1, i2, ip, ilp, iperc)
    return dfidr

In [None]:
# IDR statistics for every pair of files sharing the same sample name and tool.
df_idr = compute_idrs(dfa)

In [None]:
print('IDR')

# Number of peaks passing the IDR cutoff per sample name and tool
# (error bars: standard deviation).
plt.figure(figsize=(14, 6))
ax = plt.axes()
sns.barplot(data=df_idr,
            x='name', y='idr_peaks', hue='tool',
            # err_kws replaces the deprecated errwidth, as in the other plots
            errorbar='sd', capsize=.05, err_kws={'linewidth': 2},
            ax=ax)
plt.xticks(rotation=90)
plt.title('IDR for cell type')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.savefig(f'{PATH}/pics/idr.pdf', bbox_inches='tight', dpi=300)
plt.show()

# Overlap with DHS

In [None]:
from itertools import product

# Reference DHS sites: the original file, its record count, and a sorted
# temporary copy that is passed to bedtools below.
DHS_PATH = f'{PATH}/ENCFF754WCT_mm10_dhs_representative_sites.bed'
DHS_PEAKS = lines(DHS_PATH)
dhs_path_sorted = sorted_file(DHS_PATH)


def compute_dhs_overlaps(df):
    dfoverlap = pd.DataFrame(columns=['srr', 'name', 'tool', 'file', 'way', 'peaks', 'overlap'], dtype=object)
    for (srr, name, tool), dft in tqdm(list(df.groupby(['srr', 'name', 'tool']))):
        print('Processing', srr, name, tool, len(dft))
        for _, row in dft.iterrows():
            file, peaks = sorted_file(row['file']), row['peaks']

            tf = f'{PATH}/overlaps/overlaps_{tool}_{name}_{tool}_vs_dhs.bed'.replace(' ', '_')
            !bedtools intersect -a {file} -b {dhs_path_sorted} -wa -u > {tf}
            owd = lines(tf)
            dfoverlap.loc[len(dfoverlap)] = (srr, name, tool, file, 'with_dhs', peaks, owd)

            tf = f'{PATH}/overlaps/overlaps_{tool}_{name}_{tool}_dhs_vs.bed'.replace(' ', '_')
            !bedtools intersect -b {file} -a {dhs_path_sorted} -wa -u > {tf}
            odw = lines(tf)
            dfoverlap.loc[len(dfoverlap)] = (srr, name, tool, file, 'dhs_with', DHS_PEAKS, odw)
    return dfoverlap

In [None]:
# DHS overlaps for all peak files except the SICER ones.
dhs_overlap = compute_dhs_overlaps(dfa[dfa['tool'] != 'SICER'])
dhs_overlap.sample(3)

In [None]:
# Fraction of overlapping entries; rows whose total is not positive get 0.
dhs_overlap['overlap_share'] = [
    overlapping / total if total > 0 else 0
    for overlapping, total in zip(dhs_overlap['overlap'], dhs_overlap['peaks'])
]
dhs_overlap.sample(3)

In [None]:
plt.figure(figsize=(25, 6))

# Left panel: fraction of each tool's peaks that overlap a DHS site.
ax = plt.subplot(1, 2, 1)
ax.title.set_text('Overlap with DHS')
sns.barplot(data=dhs_overlap[dhs_overlap['way'] == 'with_dhs'], x='name', y='overlap_share', hue='tool',
            hue_order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
            palette=TOOLS_PALETTE,
            errorbar='sd', capsize=.05, err_kws={'linewidth': 2}, ax=ax)
ax.xaxis.set_tick_params(rotation=90)
ax.set_xlabel('Cell')
ax.set_ylabel('Fraction')
# The legend is only shown next to the right panel.
ax.legend().set_visible(False)

# Right panel: fraction of DHS sites that overlap a peak of each tool.
ax = plt.subplot(1, 2, 2)
ax.title.set_text('Overlap DHS with')
# NOTE(review): a chained in-place clip of the 'overlap' column (upper=0.15) used
# to follow here. That column is not plotted, so it had no effect on the figure
# and it triggers pandas chained-assignment warnings; if clipping of
# 'overlap_share' was intended, add it explicitly.
t = dhs_overlap[dhs_overlap['way'] == 'dhs_with'].copy()
sns.barplot(data=t, x='name', y='overlap_share', hue='tool',
            hue_order=['MACS2', 'MACS2 broad', 'SICER', 'SPAN'],
            palette=TOOLS_PALETTE,
            errorbar='sd', capsize=.05, err_kws={'linewidth': 2}, ax=ax)
ax.xaxis.set_tick_params(rotation=90)
ax.set_xlabel('Cell')
ax.set_ylabel('Fraction')
# Legend outside the plot area, to the right.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.savefig(f'{PATH}/pics/dhs.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# One row per (name, tool) with the mean overlap share of both directions
# as columns 'with_dhs' and 'dhs_with'.
t = pd.pivot_table(
    dhs_overlap[['name', 'tool', 'way', 'overlap_share']],
    values='overlap_share', columns=['way'], index=['name', 'tool'],
).reset_index()

plt.figure(figsize=(5, 3.5))
ax = plt.axes()
# Options shared by the density layer and the scatter layer.
by_tool = dict(
    x='with_dhs', y='dhs_with',
    hue='tool',
    hue_order=['MACS2', 'MACS2 broad', 'SPAN'],
    palette=TOOLS_PALETTE,
    ax=ax,
)
sns.kdeplot(t, alpha=0.1, thresh=0.01, fill=True, legend=False, **by_tool)
sns.scatterplot(t, **by_tool)
ax.title.set_text('Overlap vs DHS')
ax.set_xlabel('Precision (overlap peaks with DHS)')
ax.set_ylabel('Sensitivity (overlap DHS with peaks)')
# Legend outside the plot area, to the right.
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

# Top peaks vs DHS

In [None]:
import tempfile


def compute_dhs_overlaps_n(df):
    """Count DHS overlaps for the top-N peaks of every peak file.

    For each row of ``df`` the peaks are ranked in descending order by column 8
    (when the file has at least 9 columns) or column 3 otherwise, and the first
    N peaks are intersected with the DHS reference for five evenly spaced N
    between 1000 and 30000. Uses the notebook globals ``dhs_path_sorted`` and
    ``DHS_PEAKS``.

    Returns a DataFrame with columns ``srr``, ``name``, ``tool``, ``top``,
    ``dhs_peaks``, ``peaks_file``, ``peaks`` (records actually used),
    ``peaks_overlap`` (peaks overlapping DHS) and ``dhs_overlap`` (DHS sites
    overlapping peaks). Files with zero peaks yield all-zero rows.
    """
    # Scratch files: tf holds the selected top peaks, tf2 the bedtools output.
    tf = tempfile.mktemp()
    tf2 = tempfile.mktemp()
    rows = []
    for (srr, n, tool), dft in tqdm(list(df.groupby(['srr', 'name', 'tool']))):
        print('Processing', n, tool, len(dft))
        for _, row in dft.iterrows():
            peaks_file, peaks = sorted_file(row['file']), row['peaks']
            if peaks == 0:
                # Nothing to rank: emit zero rows for every cut-off.
                for top in np.linspace(1000, 30000, 5):
                    rows.append((srr, n, tool, top, DHS_PEAKS, peaks_file, 0, 0, 0))
                continue
            t = pd.read_csv(peaks_file, sep='\t', header=None)
            # if len(t.columns) < 9:
            #     display(t.head())
            #     return
            t.sort_values(by=[8] if len(t.columns) >= 9 else [3], ascending=False, inplace=True)
            for top in np.linspace(1000, 30000, 5):
                t.head(int(top)).sort_values(by=[0, 1]).to_csv(tf, sep='\t', index=False, header=None)
                # NOTE(review): sorted_file returns a new temp file, so tf points
                # to a different file after every iteration.
                tf = sorted_file(tf)
                peaks = lines(tf)
                # Top peaks that overlap at least one DHS site.
                ! bedtools intersect -a {tf} -b {dhs_path_sorted} -wa -u > {tf2}
                peaks_overlap = lines(tf2)
                # DHS sites that overlap at least one of the top peaks.
                ! bedtools intersect -b {tf} -a {dhs_path_sorted} -wa -u > {tf2}
                dhs_overlap = lines(tf2)
                rows.append((srr, n, tool, top, DHS_PEAKS, peaks_file, peaks, peaks_overlap, dhs_overlap))

    df = pd.DataFrame(
        rows,
        columns=['srr', 'name', 'tool', 'top', 'dhs_peaks', 'peaks_file', 'peaks', 'peaks_overlap', 'dhs_overlap'],
        dtype=object
    )
    return df

In [None]:
# Quick test on a single run (kept for debugging):
# dhs_ovlp_df = compute_dhs_overlaps_n(dfa[(dfa['tool'] != 'SICER') & (dfa['srr'] == 'SRR5799505')])  # Sample for testing
# Full run over all peak files except the SICER ones.
dhs_ovlp_df = compute_dhs_overlaps_n(dfa[(dfa['tool'] != 'SICER')])
dhs_ovlp_df.sample(3)

In [None]:
# Mean of the two raw overlap counts.
dhs_ovlp_df['p'] = (dhs_ovlp_df['peaks_overlap'] + dhs_ovlp_df['dhs_overlap']) / 2
# precision = overlapping peaks / peaks used; sensitivity = overlapping DHS / all DHS.
dhs_ovlp_df['precision'] = [
    d(hits, total) for hits, total in zip(dhs_ovlp_df['peaks_overlap'], dhs_ovlp_df['peaks'])
]
dhs_ovlp_df['sensitivity'] = [
    d(hits, total) for hits, total in zip(dhs_ovlp_df['dhs_overlap'], dhs_ovlp_df['dhs_peaks'])
]
# Harmonic mean of sensitivity and precision; 1e-10 keeps the reciprocals finite at 0.
dhs_ovlp_df['f1'] = [
    2 / (d(1, sens + 1e-10) + d(1, prec + 1e-10))
    for sens, prec in zip(dhs_ovlp_df['sensitivity'], dhs_ovlp_df['precision'])
]

In [None]:
# Preview a few random rows, now including the derived metric columns.
dhs_ovlp_df.sample(3)

In [None]:
# One precision/sensitivity line per (run, sample name, tool), coloured by tool.
plt.figure(figsize=(4, 4))
line_style = dict(marker='.', alpha=0.8)
for (srr, n, tool), dft in dhs_ovlp_df.groupby(['srr', 'name', 'tool']):
    print(srr, n, tool)
    plt.plot(dft['precision'], dft['sensitivity'], color=TOOLS_PALETTE[tool], **line_style)
plt.xlabel('Precision (Peaks fraction)')
plt.ylabel('Sensitivity (DHS fraction)')
plt.title('Peaks vs DHS')
plt.tight_layout()
plt.show()

In [None]:
# Same curves for a single run, with every point labelled by its top-N cut-off.
plt.figure(figsize=(4, 4))
ax = plt.axes()
# Brain microglia
single_run = dhs_ovlp_df[dhs_ovlp_df['srr'] == 'SRR5799505']
for (srr, n, tool), dft in single_run.groupby(['srr', 'name', 'tool']):
    print(srr, n, tool)
    plt.plot(dft['precision'], dft['sensitivity'], marker='.', color=TOOLS_PALETTE[tool], alpha=0.8)
    for x, y, t in zip(dft['precision'], dft['sensitivity'], dft['top']):
        label = str(int(t))
        ax.text(x, y, label, fontsize=7)
plt.xlabel('Precision (Peaks fraction)')
plt.ylabel('Sensitivity (DHS fraction)')
plt.title('Peaks vs DHS')
plt.tight_layout()
plt.show()

In [None]:
print('ENCODE + Roadmap')
plt.figure(figsize=(5, 3.5))
ax = plt.axes()
# Single run, cut-offs of at least 5000 peaks; points stay in data order (sort=False).
is_run = dhs_ovlp_df['srr'] == 'SRR5799505'
is_large_top = dhs_ovlp_df['top'] >= 5000
sns.lineplot(
    data=dhs_ovlp_df[is_run & is_large_top],
    x='precision', y='sensitivity',
    hue='tool', marker='o',
    sort=False,
    palette=TOOLS_PALETTE,
    ax=ax,
)

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.xlabel('Precision (overlap peaks with DHS)')
plt.ylabel('Sensitivity (overlap DHS with peaks)')
plt.title('Peaks vs DHS')

plt.tight_layout()
plt.show()

In [None]:
# Area under the precision/sensitivity curve per (run, sample name, tool):
# trapezoid rule over the points ordered by 'top', starting from (0, 0).
rows = []
for (srr, n, tool), dft in dhs_ovlp_df.groupby(['srr', 'name', 'tool']):
    ordered = dft.sort_values(by=['top'])
    ap = 0
    sprev, pprev = 0, 0
    for sens, prec in zip(ordered['sensitivity'], ordered['precision']):
        ap += (sens - sprev) * (prec + pprev) / 2
        sprev, pprev = sens, prec
    rows.append((srr, n, tool, ap))
dhs_ovlp_df_df = pd.DataFrame(rows, columns=['srr', 'name', 'tool', 'auc'])
del rows
dhs_ovlp_df_df.sample(2)

In [None]:
# Mean AUC per tool as bars, with the individual values drawn on top as dots.
plt.figure(figsize=(3, 4))
ax = plt.axes()
ax.title.set_text('Peaks vs DHS AUC')
t = dhs_ovlp_df_df[dhs_ovlp_df_df['tool'] != 'SICER']
g_results = sns.barplot(
    data=t, x='tool', y='auc',
    hue='tool', legend=False,
    palette=TOOLS_PALETTE,
    capsize=.1, err_kws={'linewidth': 2},
    ax=ax,
)
sns.stripplot(
    data=t, x='tool', y='auc',
    dodge=True, size=1.5, color="black", alpha=0.5,
    ax=ax,
)

ax.set_ylabel('AUC')
ax.xaxis.set_tick_params(rotation=90)
plt.tight_layout()
plt.show()

# Bigwigs

In [None]:
def atac_load_bws(path):
    """List the ``.bw`` files in ``path``.

    Returns a DataFrame with one row per file: ``file`` (full path), ``srr``
    and ``name`` (both parsed from the file name).
    """
    df = pd.DataFrame(columns=['file', 'srr', 'name'], dtype=object)
    for entry in tqdm(os.listdir(path)):
        if entry.endswith('.bw'):
            full_path = os.path.join(path, entry)
            df.loc[len(df)] = (full_path, file_to_srr(entry), file_to_name(entry))
    return df


# Despite its name, df_bams lists the bigWig (.bw) files found in the 'bw' sub-directory.
df_bams = atac_load_bws(PATH + '/bw')
df_bams.sample(3)

In [None]:
import pyBigWig

# Chromosome name -> size from mm10.chrom.sizes, skipping names that contain '_'.
CHROM_SIZES = {
    c: s for _, (c, s) in pd.read_csv(os.path.join(PATH, 'mm10.chrom.sizes'),
                                      sep='\t', names=['chr', 'size']).iterrows() if '_' not in c
}

# (srr, name) -> scaled total signal of the bigWig summed over CHROM_SIZES.
total_coverages = {}
ts = []
for _, (file, srr, name) in tqdm(list(df_bams[['file', 'srr', 'name']].iterrows())):
    try:
        with pyBigWig.open(file) as bw:
            # CHROM_SIZES already excludes names containing '_'.
            total_coverage = sum(
                bw.stats(chrom, exact=True, type='sum')[0] for chrom in CHROM_SIZES)
            print('Total coverage', total_coverage)
            # Multiplier to align BAM coverage with BigWig estimation
            total_coverage *= 1.1e-2
            total_coverages[(srr, name)] = total_coverage
    except Exception as e:
        # Keep going, but report the file: a missing entry in total_coverages
        # would otherwise only show up later as an unexplained KeyError.
        print(f'Failed to compute total coverage for {file}: {e!r}')

total_coverages

In [None]:
# Flatten the {(srr, name): reads} mapping into a table.
total_coverages_df = pd.DataFrame(
    [(*run_key, reads) for run_key, reads in total_coverages.items()],
    columns=['srr', 'name', 'reads'],
)
total_coverages_df.sample(5)

In [None]:
# Estimated reads per sample name (error bars: standard deviation).
plt.figure(figsize=(14, 6))
bar_style = dict(errorbar='sd', capsize=.05, err_kws={'linewidth': 2})
g_result = sns.barplot(data=total_coverages_df, x='name', y='reads', **bar_style)
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Reads')
plt.xticks(rotation=90)
plt.tight_layout()
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/peaks.pdf', bbox_inches='tight', dpi=300)
plt.show()

# Difference coverage

In [None]:
tf = tempfile.mktemp()

rows = []
for srr in tqdm(dfa['srr'].unique()):
    # if srr != 'SRR5799505':
    #     continue
    print(srr)
    t = dfa[(dfa['tool'] == 'SPAN') & (dfa['srr'] == srr)]
    if len(t) != 1:
        continue
    span_file = sorted_file(t['file'].values[0])
    cell_name = t['name'].values[0]
    totalm = total_coverages[(srr, cell_name)] * 1e-6
    # Processing single tools information
    bw_path = df_bams[(df_bams['srr'] == srr) & (df_bams['name'] == cell_name)]['file'].values[0]
    with pyBigWig.open(bw_path) as bw:
        for tool in dfa['tool'].unique():
            if tool == 'SPAN':
                continue
            t = dfa[(dfa['tool'] == tool) & (dfa['srr'] == srr)]
            if len(t) != 1:
                continue
            peaks_file = sorted_file(t['file'].values[0])
            for name, args in [
                (f'SPAN - {tool}', f' -a {span_file} -b {peaks_file} '),
                (f'{tool} - SPAN', f' -b {span_file} -a {peaks_file} ')]:
                !echo '' > {tf}
                !bedtools intersect {args} -wa -v > {tf}
                t2 = pd.read_csv(tf, sep='\t', header=None)
                # if len(t.columns) < 9:
                #     display(t.head())
                #     return 
                t2.sort_values(by=[8] if len(t2.columns) >= 9 else [3], ascending=False, inplace=True)
                t2 = t2[~t2[0].str.contains('M|_', regex=True)]
                for _, (chr, start, end) in t2.head(50)[[0, 1, 2]].iterrows():
                    cov = bw.stats(chr, start, end, exact=True, type='sum')[0]
                    rpk = cov / (end - start) * 1e3
                    rpm = cov / totalm
                    rpkm = rpk / totalm
                    rows.append((srr, cell_name, name, len(t2), chr, start, end, end - start, rpk, rpm, rpkm))

diff_bench_coverage_df = pd.DataFrame(
    rows,
    columns=['srr', 'cell', 'name', 'peaks', 'chromosome', 'start', 'end', 'length', 'rpk', 'rpm', 'rpkm'],
    dtype=object
)
diff_bench_coverage_df.sample(3)

In [None]:
print('Diff')

# Number of tool-specific peaks for each SPAN/MACS2 comparison.
plt.figure(figsize=(3, 4))
sns.barplot(
    data=diff_bench_coverage_df,
    x='name', y='peaks',
    order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'MACS2 - SPAN', 'MACS2 broad - SPAN'],
    capsize=.1, err_kws={'linewidth': 2},
)
plt.title('Difference')
plt.xticks(rotation=90)
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/difference.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
print('Diff')

# Length of the top tool-specific peaks for each SPAN/MACS2 comparison.
plt.figure(figsize=(3, 4))
sns.barplot(
    data=diff_bench_coverage_df,
    x='name', y='length',
    order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'MACS2 - SPAN', 'MACS2 broad - SPAN'],
    capsize=.1, err_kws={'linewidth': 2},
)
plt.title('Length of difference')
plt.xticks(rotation=90)
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/difference.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
print('Diff')

# RPKM over the top tool-specific peaks for each SPAN/MACS2 comparison.
plt.figure(figsize=(3, 4))
g_result = sns.barplot(
    data=diff_bench_coverage_df,
    x='name', y='rpkm',
    # order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'SPAN - SICER', 'MACS2 - SPAN', 'MACS2 broad - SPAN', 'SICER - SPAN'],
    order=['SPAN - MACS2', 'SPAN - MACS2 broad', 'MACS2 - SPAN', 'MACS2 broad - SPAN'],
    capsize=.05, err_kws={'linewidth': 2},
)
plt.title('Coverage of difference')
plt.xticks(rotation=90)
# g_result.axes.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
# plt.savefig(f'{PATH}/pics/difference.pdf', bbox_inches='tight', dpi=300)
plt.show()