# Immgen PRJNA392905 - ATAC-Seq SPAN, MACS2 replicates consistency

Logbook: https://docs.google.com/document/d/1WxzLWUX0PV2TpD0VfwStHczN6YSQ35Uu6DDR-8WX-io/edit#heading=h.xg4nq1px6lhh

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os
import glob
import re
from pybedtools import BedTool
from tqdm.auto import tqdm

In [None]:
# Root folder of the analysis; scratch overlap files are written here as well
PATH = '/mnt/stripe/shpynov/2021_Immgen_atacseq'


def file_to_name(file):
    """Reduce a peak file path to its sample name.

    Removes everything up to and including the ``GSM<digits>_`` prefix and
    everything from ``_ATAC_seq`` onwards; a string matching neither part is
    returned unchanged.
    """
    prefix_or_suffix = '(.*GSM[0-9]+_)|(_ATAC_seq.*)'
    sample_name = re.sub(prefix_or_suffix, '', file)
    return sample_name

In [None]:
def bedl(file):
    """Return the length (column 2 minus column 1) of every row in a BED-like file.

    The file is read as headerless, tab-separated text.

    Returns:
        A pandas Series of lengths, or an empty numpy array when the file
        has no content.

    Raises:
        Anything other than an empty file (missing file, rows with fewer than
        three columns, ...) propagates to the caller instead of being silently
        reported as "no peaks".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return np.zeros(0)  # Empty file
    return tf[2] - tf[1]

def lines(file):
    """Return the number of rows in a headerless, tab-separated file.

    Returns 0 when the file has no content. Any other failure (for example a
    missing file) propagates to the caller rather than being counted as zero.
    """
    try:
        return len(pd.read_csv(file, sep='\t', header=None))
    except pd.errors.EmptyDataError:
        return 0  # Empty file

# MACS2

In [None]:
MACS2_FOLDER='/mnt/stripe/shpynov/2021_Immgen_atacseq/macs2'
LEVELS = ['q0.05']

# One record per narrowPeak file whose path carries one of the LEVELS tags:
# sample name, "macs2 <level>" label, path, number of peaks, total peak length.
macs2_rows = []
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.narrowPeak')):
    level = next((l for l in LEVELS if f'_{l}_' in file), None)
    if not level:
        continue  # path carries none of the LEVELS tags
    macs2_rows.append(
        (file_to_name(file), f'macs2 {level}', file, lines(file), bedl(file).sum())
    )

dfm = pd.DataFrame(macs2_rows,
                   columns=['name', 'level', 'file', 'peaks', 'length'],
                   dtype=object)
dfm = dfm.astype({'peaks': int, 'length': int})

In [None]:
print('Dataset size')
# Files per sample divided by the number of levels gives the replicate count
files_per_sample = dfm.groupby(['name'])['file'].count()
t = (files_per_sample / len(LEVELS)).astype(int)
replicates_df = pd.DataFrame({'name': t.index, 'replicates': t})

plt.figure(figsize=(8, 4))
sns.barplot(data=replicates_df, x='name', y='replicates')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Replicates per sample: MACS2 files per sample divided by the number of levels
macs2_files_per_sample = dfm.groupby(['name'])['file'].count()
display((macs2_files_per_sample / len(LEVELS)).astype(int))

In [None]:
# Peak counts grouped per sample and level; replicates form each group
macs2_peaks_grouped = dfm.groupby(['name', 'level'])['peaks']
print('Mean peaks')
display(macs2_peaks_grouped.mean())
print('Std peaks')
display(macs2_peaks_grouped.std())

In [None]:
# Average peak length per file; files without peaks get 0 instead of a division error
dfm['av_length'] = [
    length / peaks if peaks != 0 else 0
    for length, peaks in zip(dfm['length'], dfm['peaks'])
]
macs2_av_length_grouped = dfm.groupby(['name', 'level'])['av_length']
print('Mean average length')
display(macs2_av_length_grouped.mean())
print('Std average length')
display(macs2_av_length_grouped.std())

# Visualization

In [None]:
def plotdf(df, levels, title):
    """Draw one bar-plus-swarm panel per level, side by side, on a common y range.

    df:     frame with 'name' and 'level' columns and a column named by `title`
    levels: values of df['level'] to plot, one panel each, left to right
    title:  column plotted on the y axis; also the y label of the first panel

    Each panel's width is proportional to the number of distinct names it shows.
    The figure is neither shown nor closed here; the caller does that.
    """
    # Work on a copy so the helper column never reaches the caller's frame
    frame = df.copy()

    # One key per (name, level) combination; their count sizes figure and grid
    frame['ln'] = frame['name'] + " " + frame['level']
    total_cols = len(set(frame['ln']))
    ylims = {}

    plt.figure(figsize=(total_cols / 4, 6))
    col = 0
    for level in levels:
        panel = frame.loc[frame['level'] == level].sort_values(by=['name'])
        # Distinct names in order of first appearance (already sorted above)
        names = list(dict.fromkeys(panel['name']))
        width = len(set(panel['ln']))

        ax = plt.subplot2grid((1, total_cols), (0, col), colspan=width)
        sns.barplot(data=panel, x="name", y=title,
                    ci="sd", capsize=.2, errwidth=2,
                    edgecolor="black", ax=ax)
        sns.swarmplot(data=panel, x="name", y=title,
                      size=2, color='darkgrey', ax=ax)
        ax.legend().set_visible(False)

        # Remember this panel's range; all panels are aligned after the loop
        ylims[ax] = ax.get_ylim()

        if col == 0:
            ax.set_ylabel(title)
        else:
            # Only the leftmost panel carries y tick labels and a y label
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')

        col += width
        ax.set_xlabel('')
        ax.set_title(level)
        ax.set_xticks(range(0, len(names)))
        ax.set_xticklabels(names, rotation=90)

    lowest = min(lim[0] for lim in ylims.values())
    highest = max(lim[1] for lim in ylims.values())
    for ax in ylims:
        ax.set_ylim(bottom=lowest, top=highest)
    plt.tight_layout()

In [None]:
# MACS2 peak counts per sample
plotdf(df=dfm, levels=['macs2 q0.05'], title='peaks')
plt.show()

# SPAN

In [None]:
import subprocess
import re
from pybedtools import BedTool

SPAN_FOLDER='/mnt/stripe/shpynov/2021_Immgen_atacseq/span-islands'
FDRS = [0.05]

# One record per matching .islands file whose path carries one of the FDRS tags:
# sample name, "span <fdr>" label, path, number of peaks, total peak length.
span_rows = []
for file in tqdm(glob.glob(SPAN_FOLDER + '/*300*.islands')):
    fdr = next((f for f in FDRS if f'_{f}_' in file), None)
    if not fdr:
        continue  # path carries none of the FDRS tags
    span_rows.append(
        (file_to_name(file), f'span {fdr}', file, lines(file), bedl(file).sum())
    )

dfs = pd.DataFrame(span_rows,
                   columns=['name', 'level', 'file', 'peaks', 'length'],
                   dtype=object)
dfs = dfs.astype({'peaks': int, 'length': int})

In [None]:
# Replicates per sample: SPAN files per sample divided by the number of FDR
# settings that were loaded. FDRS (not the MACS2 LEVELS list) is the right
# divisor here, because dfs only holds files matching an entry of FDRS.
display((dfs.groupby(['name'])['file'].count() / len(FDRS)).astype(int))

In [None]:
# Peak counts grouped per sample and level; replicates form each group
span_peaks_grouped = dfs.groupby(['name', 'level'])['peaks']
print('Mean peaks')
display(span_peaks_grouped.mean())
print('Std peaks')
display(span_peaks_grouped.std())

In [None]:
# Average peak length per file; files without peaks get 0 instead of a division error
dfs['av_length'] = [
    length / peaks if peaks != 0 else 0
    for length, peaks in zip(dfs['length'], dfs['peaks'])
]
span_av_length_grouped = dfs.groupby(['name', 'level'])['av_length']
print('Mean average length')
display(span_av_length_grouped.mean())
print('Std average length')
display(span_av_length_grouped.std())

In [None]:
# SPAN peak counts per sample
plotdf(df=dfs, levels=['span 0.05'], title='peaks')
plt.show()

In [None]:
print('Summary peaks')
# MACS2 and SPAN records stacked into one frame; 'level' tells the tools apart
dfa = pd.concat([dfm, dfs])

# Alternative view of the same data as a box plot (disabled):
# plt.figure(figsize=(12, 7))
# sns.boxplot(data=dfa, x='name', y='peaks', hue='level')
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()

# Bars show the mean over replicates, error bars the standard deviation
summary_bar_style = dict(ci='sd', capsize=.2, errwidth=2, alpha=0.8)
plt.figure(figsize=(12, 7))
sns.barplot(data=dfa, x='name', y='peaks', hue='level', **summary_bar_style)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
print('Peaks length distribution')
# Cap the average length at 5000 for visualization. The clipped column is
# assigned back explicitly: an in-place clip on the Series returned by
# dfa['av_length'] is a chained modification that is not guaranteed to reach
# dfa (and never does under pandas Copy-on-Write).
dfa['av_length'] = dfa['av_length'].clip(upper=5000)
plt.figure(figsize=(12, 7))
sns.boxplot(data=dfa, x='name', y='av_length', hue='level')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Overlaps

In [None]:
from itertools import product

def compute_overlaps(df, tool):
    """Count overlapping peaks for every ordered pair of replicate files.

    For each (name, level) group of `df`, every ordered pair of distinct files
    (file1, file2) is passed to ``bedtools intersect -a file1 -b file2 -wa -u``
    and the number of rows bedtools writes is stored as 'overlap'.

    df:   frame with 'name', 'level' and 'file' columns
    tool: label stored in the 'tool' column and used in the scratch file name

    Returns a frame with columns tool, name, level, file1, file2, overlap.

    Raises FileNotFoundError if the bedtools executable cannot be started.
    A non-zero bedtools exit status is reported with its stderr output, and
    the pair is still recorded with whatever the scratch file contains.
    """
    # Local import keeps the function independent of cell execution order
    import subprocess

    columns = ['tool', 'name', 'level', 'file1', 'file2', 'overlap']
    rows = []
    for n in sorted(set(df['name'])):
        for l in sorted(set(df['level'])):
            dfnl = df.loc[(df['name'] == n) & (df['level'] == l)]
            print('Processing', n, l, len(dfnl))
            files = list(dfnl['file'])
            # Scratch file of this group, overwritten for every pair
            tf = f'{PATH}/overlaps_{tool}_{n}_{l}_overlaps.bed'.replace(' ', '_')
            for (f1, f2) in tqdm(product(files, files), total=len(files) ** 2):
                if f1 == f2:
                    continue
                # Argument list rather than a shell string: paths containing
                # spaces or shell metacharacters reach bedtools unchanged
                with open(tf, 'w') as out:
                    proc = subprocess.run(
                        ['bedtools', 'intersect', '-a', f1, '-b', f2, '-wa', '-u'],
                        stdout=out, stderr=subprocess.PIPE)
                if proc.returncode != 0:
                    print('bedtools intersect failed for', f1, f2,
                          proc.stderr.decode(errors='replace'))
                rows.append((tool, n, l, f1, f2, lines(tf)))
    return pd.DataFrame(rows, columns=columns, dtype=object)

## MACS2 overlaps

In [None]:
dfm_overlap = compute_overlaps(dfm, 'macs2')

# Peak count of each (name, level, file); the first matching dfm row wins
macs2_peaks_lookup = {}
for n, l, f, p in zip(dfm['name'], dfm['level'], dfm['file'], dfm['peaks']):
    macs2_peaks_lookup.setdefault((n, l, f), p)

# Overlap as a fraction of the peaks of file1; 0 when file1 has no peaks
dfm_overlapf = []
for o, n, l, f in tqdm(zip(dfm_overlap['overlap'], dfm_overlap['name'],
                           dfm_overlap['level'], dfm_overlap['file1'])):
    p = macs2_peaks_lookup[(n, l, f)]
    dfm_overlapf.append(o / p if p > 0 else 0)
dfm_overlap['overlapf'] = dfm_overlapf

macs2_overlapf_grouped = dfm_overlap.groupby(['name', 'level'])['overlapf']
print('Mean overlap')
display(macs2_overlapf_grouped.mean())
print('Std overlap')
display(macs2_overlapf_grouped.std())

In [None]:
# Pairwise replicate overlap fractions for MACS2
plotdf(df=dfm_overlap, levels=['macs2 q0.05'], title='overlapf')
plt.show()

## SPAN overlaps

In [None]:
dfs_overlap = compute_overlaps(dfs, 'span')

# Peak count of each (name, level, file); the first matching dfs row wins
span_peaks_lookup = {}
for n, l, f, p in zip(dfs['name'], dfs['level'], dfs['file'], dfs['peaks']):
    span_peaks_lookup.setdefault((n, l, f), p)

# Overlap as a fraction of the peaks of file1; 0 when file1 has no peaks
dfs_overlapf = []
for o, n, l, f in tqdm(zip(dfs_overlap['overlap'], dfs_overlap['name'],
                           dfs_overlap['level'], dfs_overlap['file1'])):
    p = span_peaks_lookup[(n, l, f)]
    dfs_overlapf.append(o / p if p > 0 else 0)
dfs_overlap['overlapf'] = dfs_overlapf

span_overlapf_grouped = dfs_overlap.groupby(['name', 'level'])['overlapf']
print('Mean overlap')
display(span_overlapf_grouped.mean())
print('Std overlap')
display(span_overlapf_grouped.std())

In [None]:
# Pairwise replicate overlap fractions for SPAN
plotdf(df=dfs_overlap, levels=['span 0.05'], title='overlapf')
plt.show()

In [None]:
print('Summary overlap')
# MACS2 and SPAN pairwise overlaps stacked into one frame
dfao = pd.concat([dfm_overlap, dfs_overlap])
# Rows of the two levels compared in the plot
compared_overlaps = dfao[dfao['level'].isin(['macs2 q0.05', 'span 0.05'])]

# Alternative view of the same data as a box plot (disabled):
# plt.figure(figsize=(12, 7))
# sns.boxplot(data=dfao[dfao['level'].isin(['macs2 q0.05', 'span 0.05'])], x='name', y='overlapf', hue='level')
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()

# Bars show the mean over replicate pairs, error bars the standard deviation
overlap_bar_style = dict(ci='sd', capsize=.2, errwidth=2, alpha=0.8)
plt.figure(figsize=(12, 7))
sns.barplot(data=compared_overlaps, x='name', y='overlapf', hue='level',
            **overlap_bar_style)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Overlap with DHS

In [None]:
from itertools import product

# Reference DHS intervals and their number (denominator of the 'dhs_with' rows)
DHS_PATH = f'{PATH}/mm10_dhs.bed'
DHS_PEAKS = lines(DHS_PATH)

def compute_dhs_overlaps(df, tool):
    """Count overlaps between each peak file and the DHS file, in both directions.

    For every row of `df` two records are produced, in this order:
      * way 'with_dhs': rows reported by
        ``bedtools intersect -a <file> -b DHS_PATH -wa -u``; 'peaks' is the
        row's own peak count
      * way 'dhs_with': rows reported by
        ``bedtools intersect -a DHS_PATH -b <file> -wa -u``; 'peaks' is DHS_PEAKS

    df:   frame with 'name', 'level', 'file' and 'peaks' columns
    tool: label stored in the 'tool' column and used in the scratch file names

    Returns a frame with columns tool, name, level, file, way, peaks, overlap.

    Raises FileNotFoundError if the bedtools executable cannot be started.
    A non-zero bedtools exit status is reported with its stderr output, and
    the record is still added with whatever the scratch file contains.
    """
    # Local import keeps the function independent of cell execution order
    import subprocess

    def intersect_count(a, b, out_path):
        # Argument list rather than a shell string: paths containing spaces
        # or shell metacharacters reach bedtools unchanged
        with open(out_path, 'w') as out:
            proc = subprocess.run(
                ['bedtools', 'intersect', '-a', a, '-b', b, '-wa', '-u'],
                stdout=out, stderr=subprocess.PIPE)
        if proc.returncode != 0:
            print('bedtools intersect failed for', a, b,
                  proc.stderr.decode(errors='replace'))
        return lines(out_path)

    columns = ['tool', 'name', 'level', 'file', 'way', 'peaks', 'overlap']
    rows = []
    for n in sorted(set(df['name'])):
        for l in sorted(set(df['level'])):
            dfnl = df.loc[(df['name'] == n) & (df['level'] == l)]
            print('Processing', n, l, len(dfnl))
            # Scratch files of this group, overwritten for every file
            tf_with = f'{PATH}/overlaps_{tool}_{n}_{l}_vs_dhs.bed'.replace(' ', '_')
            tf_dhs = f'{PATH}/overlaps_{tool}_{n}_{l}_dhs_vs.bed'.replace(' ', '_')
            for _, row in tqdm(dfnl.iterrows(), total=len(dfnl)):
                file, peaks = row['file'], row['peaks']
                owd = intersect_count(file, DHS_PATH, tf_with)
                rows.append((tool, n, l, file, 'with_dhs', peaks, owd))
                odw = intersect_count(DHS_PATH, file, tf_dhs)
                rows.append((tool, n, l, file, 'dhs_with', DHS_PEAKS, odw))
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
# Two-way DHS overlaps for every MACS2 peak file
dfm_dhs_overlap = compute_dhs_overlaps(df=dfm, tool='macs2')

In [None]:
# Overlap as a fraction of the 'peaks' column; 0 when that denominator is 0
dfm_dhs_overlap['overlapf'] = [
    overlap / total if total > 0 else 0
    for overlap, total in zip(dfm_dhs_overlap['overlap'], dfm_dhs_overlap['peaks'])
]

# Bars show the mean over replicates, error bars the standard deviation
macs2_dhs_bar_style = dict(ci='sd', capsize=.2, errwidth=2, alpha=0.8)
plt.figure(figsize=(12, 7))
sns.barplot(data=dfm_dhs_overlap, x='name', y='overlapf', hue='way',
            **macs2_dhs_bar_style)
plt.xticks(rotation=90)
plt.title('Two way overlaps with mm10 DHS')
plt.tight_layout()
plt.show()

In [None]:
# Two-way DHS overlaps for every SPAN peak file
dfs_dhs_overlap = compute_dhs_overlaps(df=dfs, tool='span')

In [None]:
# Overlap as a fraction of the 'peaks' column; 0 when that denominator is 0
dfs_dhs_overlap['overlapf'] = [
    overlap / total if total > 0 else 0
    for overlap, total in zip(dfs_dhs_overlap['overlap'], dfs_dhs_overlap['peaks'])
]

# Bars show the mean over replicates, error bars the standard deviation
span_dhs_bar_style = dict(ci='sd', capsize=.2, errwidth=2, alpha=0.8)
plt.figure(figsize=(12, 7))
sns.barplot(data=dfs_dhs_overlap, x='name', y='overlapf', hue='way',
            **span_dhs_bar_style)
plt.xticks(rotation=90)
plt.title('Two way overlaps with mm10 DHS')
plt.tight_layout()
plt.show()

In [None]:
# MACS2 and SPAN DHS overlaps stacked into one frame (original row labels kept)
dhs_overlap = pd.concat(objs=[dfm_dhs_overlap, dfs_dhs_overlap])

In [None]:
plt.figure(figsize=(20, 7))
# Bars show the mean over replicates, error bars the standard deviation
dhs_bar_style = dict(ci='sd', capsize=.2, errwidth=2, alpha=0.8)

# Left panel: 'with_dhs' rows, i.e. overlap counted on the peak file side
ax = plt.subplot(1, 2, 1)
ax.title.set_text('Overlap with DHS')
sns.barplot(data=dhs_overlap[dhs_overlap['way']=='with_dhs'], x='name', y='overlapf', hue='level',
            ax=ax, **dhs_bar_style)
ax.xaxis.set_tick_params(rotation=90)
# The x axis holds the samples; previously both calls set the y label,
# so 'Cell' was overwritten and never displayed
ax.set_xlabel('Cell')
ax.set_ylabel('Overlap')

# Right panel: 'dhs_with' rows, i.e. overlap counted on the DHS side
ax = plt.subplot(1, 2, 2)
ax.title.set_text('Overlap DHS with')
t = dhs_overlap[dhs_overlap['way']=='dhs_with'].copy()
# Cap the plotted fraction for visualization. The clip has to target
# 'overlapf' (the column drawn below), not the raw 'overlap' count, and is
# assigned back because an in-place clip on t[...] is a chained modification.
t['overlapf'] = t['overlapf'].clip(upper=0.15)
sns.barplot(data=t, x='name', y='overlapf', hue='level',
            ax=ax, **dhs_bar_style)
ax.xaxis.set_tick_params(rotation=90)
ax.set_xlabel('Cell')
ax.set_ylabel('Overlap')

plt.tight_layout()
plt.show()

### Venn diagram of DHS vs ATAC-seq peaks averaged by cell

In [None]:
import math
# Mean log2 ratio of the two overlap counts of each file:
# ('with_dhs' overlap) / ('dhs_with' overlap), averaged per level.
for l in ['macs2 q0.05', 'span 0.05']:
    level_rows = dhs_overlap[dhs_overlap['level'] == l]
    # Pair the two directions by file path rather than by the parity of the
    # row label, which only works while rows are appended strictly in
    # with_dhs / dhs_with order
    with_dhs = level_rows[level_rows['way'] == 'with_dhs'].set_index('file')['overlap']
    dhs_with = level_rows[level_rows['way'] == 'dhs_with'].set_index('file')['overlap']
    # Files with a zero count in either direction have no defined log ratio
    # and are left out instead of raising
    log2rt = [
        math.log2(with_dhs[f] / dhs_with[f])
        for f in with_dhs.index
        if with_dhs[f] > 0 and dhs_with[f] > 0
    ]
    print(l, np.mean(log2rt))

In [None]:
# 2 raised to 0.14. Presumably converts a mean log2 ratio printed by the
# previous cell back to a linear ratio; TODO confirm which level it belongs to.
2 ** 0.14

In [None]:
# 2 raised to -0.22. Presumably converts a mean log2 ratio printed by an
# earlier cell back to a linear ratio; TODO confirm which level it belongs to.
2 ** -0.22