# Immgen PRJNA392905 - ATAC-Seq: SPAN and MACS2 replicate consistency

Logbook: https://docs.google.com/document/d/1WxzLWUX0PV2TpD0VfwStHczN6YSQ35Uu6DDR-8WX-io/edit#heading=h.xg4nq1px6lhh

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import glob
import os
import re
import subprocess
import tempfile
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from tqdm.auto import tqdm

sns.set_style("whitegrid")

In [None]:
# Root data directory used by the cells below; '~' is expanded to the user's home directory.
PATH = os.path.expanduser('~/data/2022_Immgen')

def file_to_name(file):
    """Extract the sample name embedded in a peak file name.

    Only the base name of ``file`` is considered. Everything up to and
    including the ``SRR<digits>_`` prefix is removed, and so is everything
    from ``_ATAC_seq`` onwards.

    :param file: path to a peak file
    :return: what remains of the base name after both parts are removed
    """
    # Raw string so the pattern is never subject to string escape processing.
    return re.sub(r'(.*SRR[0-9]+_)|(_ATAC_seq.*)', '', os.path.basename(file))

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a BED-like file.

    The file is read as headerless tab-separated text; column 1 is used as the
    start and column 2 as the end coordinate.

    :param file: path to the tab-separated file
    :return: pandas Series with one length per row, or an empty numpy array
        when the file contains no data
    :raises FileNotFoundError: if ``file`` does not exist (other read errors
        are propagated as well instead of being reported as an empty file)
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only a genuinely empty file is treated as "no intervals".
        return np.zeros(0)
    return tf[2] - tf[1]

def lines(file):
    """Count the data rows of a headerless tab-separated file.

    :param file: path to the tab-separated file
    :return: number of rows parsed by pandas, or 0 when the file contains no data
    :raises FileNotFoundError: if ``file`` does not exist (other read errors
        are propagated as well instead of being reported as 0 rows)
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # Only a genuinely empty file counts as zero rows.
        return 0
    return len(tf)

def d(a, b):
    """Divide ``a`` by ``b``, returning 0 when ``b`` equals zero.

    :param a: numerator
    :param b: denominator
    :return: ``a / b``, or 0 for a zero denominator
    """
    if b != 0:
        return a / b
    return 0

def sorted_file(file):
    """Write a sorted copy of ``file`` to a new temporary file.

    Sorting is delegated to the external ``sort`` utility with the keys
    ``-k1,1 -k2,2n``: first field, then second field compared numerically.

    :param file: path to the file to sort
    :return: path to the temporary file holding the sorted content; the caller
        is responsible for deleting it
    :raises subprocess.CalledProcessError: if ``sort`` exits with a non-zero status
    """
    # mkstemp creates the file atomically, unlike the deprecated, race-prone mktemp.
    fd, ts = tempfile.mkstemp()
    try:
        with os.fdopen(fd, 'wb') as out:
            # An argument list instead of an interpolated shell string: paths with
            # spaces or shell metacharacters reach sort unchanged.
            subprocess.run(['sort', '-k1,1', '-k2,2n', file], stdout=out, check=True)
    except Exception:
        # Do not leave a useless temporary file behind when sorting failed.
        os.remove(ts)
        raise
    return ts

def last_col(file):
    """Return the last column of a headerless tab-separated file.

    The first row is read to find out how many columns there are; the file is
    then read again loading only the last column.

    :param file: path to the tab-separated file
    :return: pandas Series with the last column, or an empty numpy array when
        the file contains no data
    :raises FileNotFoundError: if ``file`` does not exist (other read errors
        are propagated as well instead of being reported as an empty file)
    """
    try:
        cols = len(pd.read_csv(file, sep='\t', nrows=1, header=None).columns)
        return pd.read_csv(file, sep='\t', header=None, usecols=[cols - 1])[cols - 1]
    except pd.errors.EmptyDataError:
        # Only a genuinely empty file is treated as "no values".
        return np.zeros(0)

# MACS2

In [None]:
# narrowPeak files from the 'macs2atac' folder: one row per file with its peak count.
MACS2_FOLDER = f'{PATH}/macs2atac'
LEVELS = ['q0.05']

dfma = pd.DataFrame(columns=['name', 'tool', 'file', 'peaks'], dtype=object)

for file in tqdm(glob.glob(f'{MACS2_FOLDER}/*.narrowPeak')):
    # First level whose '_<level>_' tag occurs in the path; None when there is none.
    tool = next((level for level in LEVELS if f'_{level}_' in file), None)
    if not tool:
        continue
    name = file_to_name(file)
    dfma.loc[len(dfma)] = (name, f'MACS2 ATAC {tool}', file, lines(file))

# The frame is created with object dtype; store the counts as integers.
dfma['peaks'] = dfma['peaks'].astype(int)

In [None]:
# narrowPeak files from the 'macs2' folder: one row per file with its peak count.
MACS2_FOLDER = f'{PATH}/macs2'
LEVELS = ['q0.05']

dfm = pd.DataFrame(columns=['name', 'tool', 'file', 'peaks'], dtype=object)

for file in tqdm(glob.glob(f'{MACS2_FOLDER}/*.narrowPeak')):
    # First level whose '_<level>_' tag occurs in the path; None when there is none.
    tool = next((level for level in LEVELS if f'_{level}_' in file), None)
    if not tool:
        continue
    name = file_to_name(file)
    dfm.loc[len(dfm)] = (name, f'MACS2 {tool}', file, lines(file))

# The frame is created with object dtype; store the counts as integers.
dfm['peaks'] = dfm['peaks'].astype(int)

# SPAN

In [None]:
# '*00*.peak' files from the 'span' folder: one row per file with its peak count.
SPAN_FOLDER = f'{PATH}/span'
FDRS = [0.05]

dfs = pd.DataFrame(columns=['name', 'tool', 'file', 'peaks'], dtype=object)

for file in tqdm(glob.glob(f'{SPAN_FOLDER}/*00*.peak')):
    # First FDR whose '_q<fdr>' tag occurs in the path; None when there is none.
    fdr = next((cutoff for cutoff in FDRS if f'_q{cutoff}' in file), None)
    if not fdr:
        continue
    name = file_to_name(file)
    dfs.loc[len(dfs)] = (name, f'SPAN {fdr}', file, lines(file))

# The frame is created with object dtype; store the counts as integers.
dfs['peaks'] = dfs['peaks'].astype(int)

In [None]:
# '*00*.peak' files from the 'span_atac' folder: one row per file with its peak count.
SPAN_FOLDER = f'{PATH}/span_atac'
FDRS = [0.05]

dfsa = pd.DataFrame(columns=['name', 'tool', 'file', 'peaks'], dtype=object)

for file in tqdm(glob.glob(f'{SPAN_FOLDER}/*00*.peak')):
    # First FDR whose '_q<fdr>' tag occurs in the path; None when there is none.
    fdr = next((cutoff for cutoff in FDRS if f'_q{cutoff}' in file), None)
    if not fdr:
        continue
    name = file_to_name(file)
    dfsa.loc[len(dfsa)] = (name, f'SPAN ATAC {fdr}', file, lines(file))

# The frame is created with object dtype; store the counts as integers.
dfsa['peaks'] = dfsa['peaks'].astype(int)

In [None]:
# All four peak tables stacked together; bar plot of the peak counts per sample name and tool.
dfa = pd.concat([dfma, dfm, dfs, dfsa])

plt.figure(figsize=(3, 4))
sns.barplot(x='name', y='peaks', hue='tool', data=dfa,
            ci='sd', capsize=.05, errwidth=2)
plt.title('Peaks number')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Files per sample name in dfm, divided by the number of levels so each replicate is counted once.
files_per_name = dfm.groupby(['name'])['file'].count()
t = (files_per_name / len(LEVELS)).astype(int)

plt.figure(figsize=(1, 3))
sns.barplot(x='name', y='replicates',
            data=pd.DataFrame({'name': t.index, 'replicates': t}))
plt.title('Dataset size')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Collect peak lengths for every file in dfa, keeping at most 10,000 randomly
# sampled peaks per file.
ts = []
# zip() has no length, so the total is passed explicitly for tqdm to show progress.
for name, tool, file in tqdm(zip(dfa['name'], dfa['tool'], dfa['file']), total=len(dfa)):
    lengths = bedl(file)
    t = pd.DataFrame(dict(name=[name] * len(lengths), tool=[tool] * len(lengths), length=lengths))
    ts.append(t.sample(min(len(t), 10_000)))
t = pd.concat(ts).reset_index(drop=True)
del ts
# Preview of up to 10 random rows; a plain sample(10) raises ValueError when
# fewer than 10 rows are available.
t.sample(min(len(t), 10))

In [None]:
# Box plot of the sampled peak lengths per sample name and tool; outliers are not drawn.
plt.figure(figsize=(3, 4))
sns.boxplot(x='name', y='length', hue='tool', data=t, showfliers=False)
plt.title('Peaks length')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Overlaps

In [None]:
def compute_overlaps(df):
    dfoverlap = pd.DataFrame(columns=['name', 'tool', 'file1', 'file2',
                                      'peaks1', 'peaks2', 'overlap12', 'overlap21',
                                      'peaks1_len', 'peaks2_len', 'overlap_len'], dtype=object)
    for n in sorted(set(df['name'])):
        for tool in sorted(set(df['tool'])):
            dfnl = df.loc[(df['name'] == n) &  (df['tool'] == tool)]
            print('Processing', n, tool, len(dfnl))
            files = list(dfnl['file'])
            for i1, i2 in tqdm(product(range(len(files)), range(len(files)))):
                if i1 >= i2:
                    continue
                f1, f2 = files[i1], files[i2]
                peaks1, peaks1_len = lines(f1), bedl(f1).sum()
                peaks2, peaks2_len = lines(f2), bedl(f2).sum()
                f1s, f2s = sorted_file(f1), sorted_file(f2)
                tf = f'{PATH}/overlaps_{tool}_{n}_{tool}_overlaps.bed'.replace(' ', '_')
                !bedtools intersect -a {f1s} -b {f2s} -wa -u > {tf}
                overlap12 = lines(tf)
                !bedtools intersect -b {f1s} -a {f2s} -wa -u > {tf}
                overlap21 = lines(tf)
                !bedtools intersect -a {f1s} -b {f2s} -wo > {tf}
                overlap_len = int(last_col(tf).sum())
                dfoverlap.loc[len(dfoverlap)] = (n, tool, f1, f2,
                                                 peaks1, peaks2, overlap12, overlap21,
                                                 peaks1_len, peaks2_len, overlap_len)
    return dfoverlap

## MACS2 overlaps

In [None]:
# Pairwise overlaps between the files listed in dfma (tool label 'MACS2 ATAC ...').
dfma_overlap = compute_overlaps(dfma)

In [None]:
# Pairwise overlaps between the files listed in dfm (tool label 'MACS2 ...').
dfm_overlap = compute_overlaps(dfm)

## SPAN overlaps

In [None]:
# Pairwise overlaps between the files listed in dfs (tool label 'SPAN ...').
dfs_overlap = compute_overlaps(dfs)

In [None]:
# Pairwise overlaps between the files listed in dfsa (tool label 'SPAN ATAC ...').
dfsa_overlap = compute_overlaps(dfsa)

In [None]:
# Stack the four overlap tables and add a length-based Jaccard column:
# overlap_len / (peaks1_len + peaks2_len - overlap_len), with d() giving 0 for a zero denominator.
dfao = pd.concat([dfma_overlap, dfm_overlap, dfs_overlap, dfsa_overlap])
length_triples = zip(dfao['peaks1_len'], dfao['peaks2_len'], dfao['overlap_len'])
dfao['jaccard'] = [
    d(shared, len_a + len_b - shared) for len_a, len_b, shared in length_triples
]

In [None]:
print('Jaccard')

# Bar plot of the pairwise Jaccard values per sample name and tool.
plt.figure(figsize=(3, 4))
sns.barplot(x='name', y='jaccard', hue='tool', data=dfao,
            ci='sd', capsize=.05, errwidth=2)
plt.title('Jaccard for cell type')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Overlap with DHS

In [None]:
from itertools import product

# BED file with the DHS sites and the number of rows it contains.
DHS_PATH = PATH + '/ENCFF754WCT_mm10_dhs_representative_sites.bed'
DHS_PEAKS = lines(DHS_PATH)

def compute_dhs_overlaps(df):
    dfoverlap = pd.DataFrame(columns=['name', 'tool', 'file', 'way', 'peaks', 'overlap'], dtype=object)
    for n in sorted(set(df['name'])):
        for tool in sorted(set(df['tool'])):
            dfnl = df.loc[(df['name'] == n) &  (df['tool'] == tool)]
            print('Processing', n, tool, len(dfnl))
            for _, row in tqdm(dfnl.iterrows()):
                file, peaks = row['file'], row['peaks']
                tf = f'{PATH}/overlaps_{tool}_{n}_{tool}_vs_dhs.bed'.replace(' ', '_')
                !bedtools intersect -a {file} -b {DHS_PATH} -wa -u > {tf}
                owd = lines(tf)
                dfoverlap.loc[len(dfoverlap)] = (n, tool, file, 'with_dhs', peaks, owd)
                tf = f'{PATH}/overlaps_{tool}_{n}_{tool}_dhs_vs.bed'.replace(' ', '_')
                !bedtools intersect -b {file} -a {DHS_PATH} -wa -u > {tf}
                odw = lines(tf)
                dfoverlap.loc[len(dfoverlap)] = (n, tool, file, 'dhs_with', DHS_PEAKS, odw)
    return dfoverlap

In [None]:
# DHS overlaps for the files listed in dfma (tool label 'MACS2 ATAC ...').
dfma_dhs_overlap = compute_dhs_overlaps(dfma)

In [None]:
# DHS overlaps for the files listed in dfm (tool label 'MACS2 ...').
dfm_dhs_overlap = compute_dhs_overlaps(dfm)

In [None]:
# DHS overlaps for the files listed in dfs (tool label 'SPAN ...').
dfs_dhs_overlap = compute_dhs_overlaps(dfs)

In [None]:
# DHS overlaps for the files listed in dfsa (tool label 'SPAN ATAC ...').
dfsa_dhs_overlap = compute_dhs_overlaps(dfsa)

In [None]:
# Stack the four DHS overlap tables and turn the overlap counts into fractions
# of the matching 'peaks' total (0 when that total is not positive).
dhs_overlap = pd.concat([dfma_dhs_overlap, dfm_dhs_overlap, dfs_dhs_overlap, dfsa_dhs_overlap])
dhs_overlap['overlap'] = [
    hits / total if total > 0 else 0
    for hits, total in zip(dhs_overlap['overlap'], dhs_overlap['peaks'])
]

In [None]:
plt.figure(figsize=(6, 4))

# Left panel: rows with way == 'with_dhs' (fraction of each file's peaks overlapping DHS).
ax = plt.subplot(1, 2, 1)
ax.title.set_text('Overlap with DHS')
sns.barplot(data=dhs_overlap[dhs_overlap['way']=='with_dhs'], x='name', y='overlap', hue='tool',
            ci='sd', capsize=.05, errwidth=2, ax=ax)
ax.xaxis.set_tick_params(rotation=90)
ax.set_xlabel('Cell')
ax.set_ylabel('Fraction')
ax.legend(loc='lower left', title='tool')

# Right panel: rows with way == 'dhs_with' (fraction of DHS rows overlapped), capped at 0.15.
ax = plt.subplot(1, 2, 2)
ax.title.set_text('Overlap DHS with')
t = dhs_overlap[dhs_overlap['way']=='dhs_with'].copy()
# Assign the clipped column back: an in-place clip on the t['overlap'] selection is a
# chained operation that leaves t unchanged when pandas Copy-on-Write is active.
t['overlap'] = t['overlap'].clip(upper=0.15)
sns.barplot(data=t, x='name', y='overlap', hue='tool',
            ci='sd', capsize=.05, errwidth=2, ax=ax)
ax.xaxis.set_tick_params(rotation=90)
ax.set_xlabel('Cell')
ax.set_ylabel('Fraction')
ax.legend(loc='lower left', title='tool')

plt.tight_layout()
plt.show()