# PRJN392905 Immgen - ATAC-Seq MACS vs SPAN automatic markup

Prerequisites
```
git clone https://github.com/JetBrains-Research/sc-atacseq-smk-pipeline
cd /mnt/stripe/shpynov/sc-atacseq-smk-pipeline

for Q in 0.5 0.1 0.05 0.01 0.001 1E-4 1E-6 1E-9 1E-12; do 
    echo $Q; 
    snakemake -s /mnt/stripe/shpynov/chipseq-smk-pipeline/Snakefile all --cores 24 --use-conda --config work_dir=/mnt/stripe/shpynov/PRJN392905 genome=mm10 fastq_dir=/mnt/stripe/shpynov/PRJN392905/fastq macs2_suffix=q${Q} macs2_params="-q ${Q} -f BAMPE --nomodel --nolambda -B --call-summits" span_fdr=${Q} span_params="--fragment 0"; 
done
```

In [None]:
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pybedtools import BedTool
import subprocess
import math
import glob
import re

In [None]:
# Track (cell) names are derived from the MACS2 q=0.05 peak files: the directory
# part and everything starting from "_q" are stripped from each file name.
# The folder is spelled out because EXPERIMENT_FOLDER is only defined in a later cell.
# glob returns files in arbitrary order, so sort to keep the track order stable between runs.
CELLS = sorted(re.sub('.*/|_q.*', '', f)
               for f in glob.glob('/mnt/stripe/shpynov/PRJN392905/macs2/*q0.05*.narrowPeak'))
print(len(CELLS))

In [None]:
# MACS2 peaks vs. DHS statistics.
# NOTE: EXPERIMENT_FOLDER, QS and dhs_mm10 are defined in the configuration cell
# placed under the "MACS2 peaks" heading below - that cell has to be run first.

def macs2_peaks_path(cell, q):
    """Return the path of the MACS2 narrowPeak file for track `cell` called at q-value `q`."""
    return os.path.join(EXPERIMENT_FOLDER, "macs2",
                        "{}_q{}_peaks.narrowPeak".format(cell, q))


# discover MACS2 peak files: cell -> {q -> narrowPeak path}
macs2_peaks_paths = {cell: {q: macs2_peaks_path(cell, q) for q in QS} for cell in CELLS}

# Rows computed on previous runs are cached in a CSV and are not recomputed
df_macs2_path = os.path.join(EXPERIMENT_FOLDER, 'df_macs2.csv')
if os.path.exists(df_macs2_path):
    print('Loading', df_macs2_path)
    df_macs2 = pd.read_csv(df_macs2_path)
else:
    df_macs2 = pd.DataFrame(columns=['peak_caller', 'cell', 'q', 'bin', 'gap',
                                     'peak_count', 'dhs_by_peak', 'peak_by_dhs'])

# The DHS set is the same for every track, so count it only once
dhs_count = dhs_mm10.count()

for i, cell in enumerate(CELLS):
    print('Processing cell', i + 1, 'of', len(CELLS))
    for q in QS:
        # Skip (cell, q) pairs which are already present in the table
        if ((df_macs2['cell'] == cell) & (df_macs2['q'] == float(q))).any():
            continue
        print("Processing {} for q={}".format(cell, q))
        peaks_file = BedTool(macs2_peaks_paths[cell][q])
        peak_count = peaks_file.count()
        # % of peaks overlapping at least one DHS
        overlap = peaks_file.intersect(dhs_mm10, wa=True, u=True).count()
        peak_by_dhs = 0 if overlap == 0 else overlap * 100.0 / peak_count
        # % of DHSs overlapping at least one peak
        overlap = dhs_mm10.intersect(peaks_file, wa=True, u=True).count()
        dhs_by_peak = 0 if dhs_count == 0 else overlap * 100.0 / dhs_count
        # bin and gap are SPAN-only parameters, hence NaN for MACS2
        df_macs2.loc[len(df_macs2)] = ('macs2', cell, float(q), math.nan, math.nan,
                                       peak_count, dhs_by_peak, peak_by_dhs)

# Report the save only after the file has actually been written
df_macs2.to_csv(df_macs2_path, index=None)
print('Saved', df_macs2_path)

df_macs2.head()

# MACS2 peaks

In [None]:
# Root folder holding the pipeline output for this experiment
EXPERIMENT_FOLDER = "/mnt/stripe/shpynov/PRJN392905"

# q-value thresholds in the spelling used in MACS2 peak file names
QS = [
    "0.5", "0.1", "0.05", "0.01", "0.001",
    "1E-4", "1E-6", "1E-9", "1E-12",
]
# The same thresholds, position by position, in the spelling used in SPAN peak file names
SPAN_QS = [
    "0.5", "0.1", "0.05", "0.01", "0.001",
    "0.0001", "1e-06", "1e-09", "1e-12",
]

CHROM_SIZES = os.path.join(EXPERIMENT_FOLDER, "mm10.chrom.sizes")

# DHS sites were obtained from https://www.encodeproject.org
# The original note points to the hg19 representative DHS sites
# (https://www.encodeproject.org/annotations/ENCSR664FOJ/), while the file loaded
# here is named mm10_dhs.bed - TODO confirm the actual source of the mm10 set.
dhs_mm10 = BedTool(os.path.join(EXPERIMENT_FOLDER, "dhs", "mm10_dhs.bed"))

# Span peaks

In [None]:
# discover Span peak files
def span_peaks_path(cell, q):
    """Return the path of the SPAN peak file for track `cell` at FDR `q` (bin 100, gap 5 in the name)."""
    return os.path.join(EXPERIMENT_FOLDER, "span","{}_100_{}_5.peak".format(cell, q))


# cell -> {q as spelled in QS -> SPAN peak file path}; SPAN spells the FDR
# differently in file names, so QS and SPAN_QS are matched by position
span_peaks_paths = {
    cell: {q: span_peaks_path(cell, span_q) for q, span_q in zip(QS, SPAN_QS)}
    for cell in CELLS
}

# Rows computed on previous runs are cached in a CSV and are not recomputed
df_span_path = os.path.join(EXPERIMENT_FOLDER, 'df_span.csv')
if os.path.exists(df_span_path):
    print('Loading', df_span_path)
    df_span = pd.read_csv(df_span_path)
else:
    df_span = pd.DataFrame(columns=['peak_caller', 'cell', 'q', 'bin', 'gap',
                                    'peak_count', 'dhs_by_peak', 'peak_by_dhs'])

# The DHS set is the same for every track, so count it only once
dhs_count = dhs_mm10.count()

for cell in CELLS:
    for q in QS:
        # Skip (cell, q) pairs which are already present in the table
        if ((df_span['cell'] == cell) & (df_span['q'] == float(q))).any():
            continue
        print("Processing {} for q={}".format(cell, q))
        peaks_file = BedTool(span_peaks_paths[cell][q])
        peak_count = peaks_file.count()
        # % of peaks overlapping at least one DHS
        overlap = peaks_file.intersect(dhs_mm10, wa=True, u=True).count()
        peak_by_dhs = 0 if overlap == 0 else overlap * 100.0 / peak_count
        # % of DHSs overlapping at least one peak
        overlap = dhs_mm10.intersect(peaks_file, wa=True, u=True).count()
        dhs_by_peak = 0 if dhs_count == 0 else overlap * 100.0 / dhs_count
        # bin=100 and gap=5 correspond to the "_100_..._5.peak" file name
        df_span.loc[len(df_span)] = ('span', cell, float(q), 100, 5,
                                     peak_count, dhs_by_peak, peak_by_dhs)

# Report the save only after the file has actually been written
df_span.to_csv(df_span_path, index=None)
print('Saved', df_span_path)
df_span.head()

# Automatic markup

In [None]:
%%bash
# Bash commands to create markup by Immgen

WORK_DIR=/mnt/stripe/shpynov/PRJN392905
OUT="${WORK_DIR}/intersect.tsv"
T=$'\t'

# Header: chr, start, end followed by one column per MACS2 peaks file.
# The file list is sorted so that the column order is stable between runs.
printf %s "chr${T}start${T}end" > "${OUT}"
FILES=()
while IFS= read -r F; do
    FILES+=("$F")
    printf %s "${T}${F}" >> "${OUT}"
done < <(find "${WORK_DIR}/macs2_q0.05/" -name "*.narrowPeak" | sort)
echo >> "${OUT}"

# multiinter reports one presence column per input file starting from column 6;
# adjacent intervals are merged keeping the max, i.e. 1 if the file has a peak there.
bedtools multiinter -i "${FILES[@]}" |\
    bedtools merge -c "$(seq -s, 6 1 $((${#FILES[@]} + 5)))" -o max |\
    awk '{if (NR > 1) printf("\n"); printf("%s\t%s\t%s", $1, $2, $3); for (i=4; i<=NF; i++) printf("\t%d", int($i)); }' >> "${OUT}"

# Find out regions where all the peaks present:
# the line has to end with one "<TAB>1" per input file.
ALL=""
for _ in "${FILES[@]}"; do
    ALL="${ALL}${T}1"
done
grep "${ALL}\$" "${OUT}" | awk -v OFS='\t' '{print $1,$2,$3}' > "${WORK_DIR}/intersect_all.bed"

# Find regions intersecting with at least 80% by DHS and report original DHS.
DHS_ALL="${WORK_DIR}/mm10_dhs_intersect_all_0.8.bed"
bedtools intersect -b "${WORK_DIR}/intersect_all.bed" -a "${WORK_DIR}/dhs/mm10_dhs.bed" -F 0.8 -f 0.8 -wa -wb > "${DHS_ALL}"

# Total 150 markup regions, taken from consecutive, non-overlapping slices of the DHS list
MARKUP="${WORK_DIR}/markup.bed"

# 50 peaks: lines 1-50, the whole DHS
awk -v OFS='\t' 'NR <= 50 {print $1,$2,$3,"peaks"}' "${DHS_ALL}" > "${MARKUP}"

# 50 peakStart: lines 51-100, left half of the DHS
awk 'NR > 50 && NR <= 100 { printf("%s\t%d\t%d\t%s\n", $1,$2,($2+$3)/2 - 1,"peakStart")}' "${DHS_ALL}" >> "${MARKUP}"

# 50 peakEnd: lines 101-150, right half of the DHS
awk 'NR > 100 && NR <= 150 {printf("%s\t%d\t%d\t%s\n", $1,($2+$3)/2 + 1,$3,"peakEnd")}' "${DHS_ALL}" >> "${MARKUP}"

# extended markup: +-2000bp around each markup region,
# tab-separated as BED requires and with the start clamped at 0
awk -v OFS='\t' '{s = $2 - 2000; if (s < 0) s = 0; print $1, s, $3 + 2000}' "${MARKUP}" > "${WORK_DIR}/markup_ext.bed"



# SPAN tuning
```
# Tune models
snakemake -s /mnt/stripe/shpynov/chipseq-smk-pipeline/Snakefile all --cores 24 --use-conda --config work_dir=/mnt/stripe/shpynov/PRJN392905 genome=mm10 fastq_dir=/mnt/stripe/shpynov/PRJN392905/fastq span_bin=100 span_markup=/mnt/stripe/shpynov/PRJN392905/markup.bed

# Rename tuned
mkdir -p /mnt/stripe/shpynov/PRJN392905/span_tuned
for F in /mnt/stripe/shpynov/PRJN392905/span/*tuned.peak; do echo $F; P=$(head -n 1 $F | sed -E 's/(^.*_100_)|(_1\t.*$)//g'); cp -f $F /mnt/stripe/shpynov/PRJN392905/span_tuned/$(echo $F | sed "s/tuned/$P/g" | sed 's#.*/##g'); done
```

In [None]:
import glob
# discover tuned Span peak files

def span_tuned_peaks_path(cell):
    """Return the tuned SPAN peak file of track `cell` from the span_tuned folder.

    Raises FileNotFoundError when no file matches. If several files match,
    the first one in sorted order is returned so that the choice is stable.
    """
    pattern = EXPERIMENT_FOLDER + "/span_tuned/*" + cell + "*.peak"
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError("No tuned SPAN peaks found for {}: {}".format(cell, pattern))
    return matches[0]


# cell -> (peak file path, tuned q, tuned gap); q and gap are parsed from the
# "..._seq_100_<q>_<gap>.peak" file name
span_tuned_peaks_paths = {}
for cell in CELLS:
    path = span_tuned_peaks_path(cell)
    q, gap = re.sub(r'.*_seq_100_|\.peak', '', path).split('_')
    span_tuned_peaks_paths[cell] = (path, q, gap)

# Rows computed on previous runs are cached in a CSV and are not recomputed
df_span_tuned_path = os.path.join(EXPERIMENT_FOLDER, 'df_span_tuned.csv')
if os.path.exists(df_span_tuned_path):
    print('Loading', df_span_tuned_path)
    df_span_tuned = pd.read_csv(df_span_tuned_path)
else:
    df_span_tuned = pd.DataFrame(columns=['peak_caller', 'cell', 'q', 'bin', 'gap',
                                          'peak_count', 'dhs_by_peak', 'peak_by_dhs'])

# The DHS set is the same for every track, so count it only once
dhs_count = dhs_mm10.count()

for cell in CELLS:
    # Skip tracks which are already present in the table
    if (df_span_tuned['cell'] == cell).any():
        continue
    print("Processing {}".format(cell))
    peaks_path, q, gap = span_tuned_peaks_paths[cell]
    peaks_file = BedTool(peaks_path)
    peak_count = peaks_file.count()
    # % of peaks overlapping at least one DHS
    overlap = peaks_file.intersect(dhs_mm10, wa=True, u=True).count()
    peak_by_dhs = 0 if overlap == 0 else overlap * 100.0 / peak_count
    # % of DHSs overlapping at least one peak
    overlap = dhs_mm10.intersect(peaks_file, wa=True, u=True).count()
    dhs_by_peak = 0 if dhs_count == 0 else overlap * 100.0 / dhs_count
    df_span_tuned.loc[len(df_span_tuned)] = ('span_tuned', cell, float(q), 100, int(gap),
                                             peak_count, dhs_by_peak, peak_by_dhs)

# Report the save only after the file has actually been written
df_span_tuned.to_csv(df_span_tuned_path, index=None)
print('Saved', df_span_tuned_path)
df_span_tuned.head()

In [None]:
# Combine the statistics of all peak callers into a single table
main_dataframe = pd.concat([df_macs2, df_span, df_span_tuned], ignore_index=True)

# Tables built in this session (not loaded from CSV) may hold their numbers in
# object columns; make them numeric so that the filters and plots below behave the same
numeric_columns = ['q', 'bin', 'gap', 'peak_count', 'dhs_by_peak', 'peak_by_dhs']
main_dataframe[numeric_columns] = main_dataframe[numeric_columns].apply(pd.to_numeric)

main_dataframe = main_dataframe.sort_values(['cell', 'peak_caller', 'q'])
main_dataframe.to_csv(os.path.join(EXPERIMENT_FOLDER, 'df.csv'), index=None)

# Keep only the rows where the DHS overlap is known
overlap_dataframe = main_dataframe[main_dataframe['dhs_by_peak'].notna()]
overlap_dataframe.describe()

# Failed tracks

In [None]:
# Tracks having more than 200000 peaks in any row are treated as failed
# and are excluded from the further analysis
too_many_peaks = overlap_dataframe['peak_count'] > 200000
failed_cells = list(set(overlap_dataframe[too_many_peaks]['cell']))
print('Failed', failed_cells)
overlap_dataframe = overlap_dataframe[~overlap_dataframe['cell'].isin(failed_cells)]

# Plot DHS AUC overlap 
NOTE: there is information only about the monocytes

In [None]:
def plot_caller_overlap(ax, caller_df, fmt, label):
    """Plot DHS-by-peak vs peak-by-DHS overlap of one peak caller and label each point with its q.

    The label is taken from the row itself, because the rows are sorted by
    ascending q while QS is listed in descending order.
    """
    ax.plot(caller_df['dhs_by_peak'], caller_df['peak_by_dhs'], fmt, label=label)
    for x, y, q in zip(caller_df['dhs_by_peak'], caller_df['peak_by_dhs'], caller_df['q']):
        ax.annotate('{:g}'.format(q), xy=(x, y))


for cell in sorted(set(overlap_dataframe['cell'])):
    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)

    cell_overlap_dataframe = overlap_dataframe[overlap_dataframe['cell'] == cell]
    plot_caller_overlap(ax,
                        cell_overlap_dataframe[cell_overlap_dataframe['peak_caller'] == 'span'],
                        'o-', 'Span')
    plot_caller_overlap(ax,
                        cell_overlap_dataframe[cell_overlap_dataframe['peak_caller'] == 'macs2'],
                        's-', 'MACS2')

    plt.legend()
    plt.title("Two-sided overlap with {} DHSs".format(cell))
    plt.xlabel("DHSs overlapped by peaks, %")
    plt.ylabel("peaks overlapped by DHSs, %")
    plt.tight_layout()
    # One image per track, otherwise every iteration overwrites the previous figure
    plt.savefig('overlap_ROC_{}.png'.format(cell), dpi=300)
    plt.show()

# Peaks counts vs True positives visualization

In [None]:
# (peak caller, color, legend label, line style used on the true positives axis,
#  line style used on the peak count axis); tuned SPAN is a single point, hence no line
caller_styles = [
    ('span', 'green', 'SPAN', ':', '-'),
    ('macs2', 'blue', 'MACS2', ':', '-'),
    ('span_tuned', 'red', 'SPAN tuned', '', ''),
]

for cell in set(overlap_dataframe['cell']):
    # Rows of this track with 1e-10 <= q <= 0.1
    q_in_range = overlap_dataframe['q'].between(1e-10, 0.1)
    cell_df = overlap_dataframe[(overlap_dataframe['cell'] == cell) & q_in_range]
    caller_dfs = {caller: cell_df[cell_df['peak_caller'] == caller]
                  for caller, _, _, _, _ in caller_styles}

    # Left axis: % of peaks overlapping DHS
    fig, ax1 = plt.subplots(figsize=(7, 5))
    ax1.set_ylim(0, 110)
    for caller, color, label, tp_line, _ in caller_styles:
        caller_df = caller_dfs[caller]
        ax1.plot(caller_df['q'], caller_df['peak_by_dhs'], 'o' + tp_line, c=color, label=label)
    ax1.axhline(y=80, color='r', linestyle=':')
    ax1.set_ylabel('true positives %')
    ax1.set_xlabel('fdr')

    # Right axis: number of peaks, sharing the (log-scaled) fdr axis
    ax2 = ax1.twinx()
    ax2.set_ylim(0, 1.1 * max(cell_df['peak_count']))
    for caller, color, label, _, count_line in caller_styles:
        caller_df = caller_dfs[caller]
        ax2.plot(caller_df['q'], caller_df['peak_count'], 's' + count_line, c=color, label=label)
    ax2.axhline(y=40000, color='gray', linestyle='--')
    ax2.set_xscale('log')
    ax2.set_ylabel('peak count')

    plt.title('Peaks ' + cell)
    plt.legend()

    fig.tight_layout()
    plt.show()

# Number of peaks / Consistency between MACS2 Q=0.05 and SPAN tuned between replicates

In [None]:
from pathlib import Path
import downstream.bed_metrics as bm

processed = set()
df_consistency = pd.DataFrame(columns=['cell', 'peak_caller', 
                                       'avg_peak_count', 'stdev_peak_count',
                                       'avg_dhs_by_peak', 'stdev_dhs_by_peak', 
                                       'avg_peak_by_dhs', 'stdev_peak_by_dhs',
                                       'avg_overlap', 'stdev_overlap'])


def consistency_row(im_cell, peak_caller, peaks_df, peaks_paths):
    """Build one df_consistency row for the replicates of `im_cell` and one peak caller.

    peaks_df holds the statistics rows of the replicates, peaks_paths the
    corresponding peak files. The overlap table is obtained from
    bm.load_or_build_metrics_table with the same files as rows and columns
    (presumably a pairwise overlap cached in the given .tsv - confirm against
    downstream.bed_metrics).
    """
    overlap_path = Path(EXPERIMENT_FOLDER + '/overlap_{}_{}.tsv'.format(peak_caller, im_cell))
    overlap = bm.load_or_build_metrics_table(peaks_paths,
                                             peaks_paths,
                                             overlap_path,
                                             jaccard=False,
                                             threads=30)
    return (im_cell, peak_caller,
            peaks_df['peak_count'].mean(),
            peaks_df['peak_count'].std(),
            peaks_df['dhs_by_peak'].mean(),
            peaks_df['dhs_by_peak'].std(),
            peaks_df['peak_by_dhs'].mean(),
            peaks_df['peak_by_dhs'].std(),
            overlap.values.mean(),
            overlap.values.std(ddof=1))


for cell in CELLS:
    # Immgen cell type: track name without the GSM id prefix and the _ATAC_seq suffix
    im_cell = re.sub('(^GSM[0-9]+_)|(_ATAC_seq.*$)', '', cell)
    if im_cell in processed:
        continue
    processed.add(im_cell)
    print(cell, im_cell)
    cell_df = overlap_dataframe[[im_cell in name for name in overlap_dataframe['cell']]]
    # Nothing to compare, e.g. all the tracks of this cell type were marked as failed.
    # NOTE(review): this counts table rows, not replicates - confirm whether cell
    # types with a single replicate should be skipped as well.
    if len(cell_df) <= 1:
        continue

    span_tuned_peaks_df = cell_df[cell_df['peak_caller'] == 'span_tuned']
    macs2_peaks_df = cell_df[np.logical_and(cell_df['peak_caller'] == 'macs2', cell_df['q'] == 0.05)]
    replicates = [
        ('span_tuned', span_tuned_peaks_df,
         [Path(span_tuned_peaks_path(name)) for name in span_tuned_peaks_df['cell']]),
        # MACS2 paths are taken from the macs2_peaks_paths lookup built by the MACS2 cell
        ('macs2', macs2_peaks_df,
         [Path(macs2_peaks_paths[name]['0.05']) for name in macs2_peaks_df['cell']]),
    ]
    for peak_caller, peaks_df, peaks_paths in replicates:
        if not peaks_paths:
            continue
        df_consistency.loc[len(df_consistency)] = consistency_row(im_cell, peak_caller,
                                                                  peaks_df, peaks_paths)

df_consistency

# LogFold change of #peaks vs LogFold change in true positives

In [None]:
import seaborn as sns

# Rows with 1e-6 <= q <= 0.1
q_df = overlap_dataframe[np.logical_and(overlap_dataframe['q'] >= 1e-6,
                                        overlap_dataframe['q'] <= 0.1)].copy()

for q in sorted(set(q_df['q'])):
    records = []
    for cell in sorted(set(q_df['cell'])):
        cq_df = q_df[np.logical_and(q_df['q'] == q, q_df['cell'] == cell)]
        cqmacs2_df = cq_df[cq_df['peak_caller'] == 'macs2']
        cqspan_df = cq_df[cq_df['peak_caller'] == 'span']
        # The comparison needs both callers at this q; e.g. a q coming only from
        # a tuned SPAN row has no MACS2 / SPAN counterpart
        if cqmacs2_df.empty or cqspan_df.empty:
            continue
        peak_by_dhs_span = cqspan_df['peak_by_dhs'].iloc[0]
        peak_by_dhs_macs2 = cqmacs2_df['peak_by_dhs'].iloc[0]
        # +1 keeps the ratio defined when a value is 0
        lf_peak_by_dhs = np.log2((peak_by_dhs_span + 1) / (peak_by_dhs_macs2 + 1))
        peak_count_span = cqspan_df['peak_count'].iloc[0]
        peak_count_macs2 = cqmacs2_df['peak_count'].iloc[0]
        lf_peak_count = np.log2((peak_count_span + 1) / (peak_count_macs2 + 1))
        # NOTE(review): hue shows SPAN peak_by_dhs while size shows MACS2 peak_count -
        # confirm that mixing the two callers here is intended
        records.append((lf_peak_by_dhs, lf_peak_count, q, peak_by_dhs_span, peak_count_macs2))
    if not records:
        continue
    ddf = pd.DataFrame(records,
                       columns=['lf_peak_by_dhs', 'lf_peak_count', 'q', 'peak_by_dhs', 'peak_count'])
    plt.figure(figsize=(8, 8))
    sns.scatterplot(data=ddf, x='lf_peak_by_dhs', y='lf_peak_count', hue='peak_by_dhs', size='peak_count')
    plt.axhline(y=0.0, color='b', linestyle=':')
    plt.axvline(x=0.0, color='b', linestyle=':')
    plt.title('q {}'.format(q))
    plt.xlabel('LF change true positives SPAN / MACS2')
    plt.ylabel('LF change peaks SPAN / MACS2')
    plt.show()

# 2-sided overlap with DHS

In [None]:
import seaborn as sns
# Every track is compared against the same mm10 DHS set, so count it only once
dhs_total = dhs_mm10.count()

for cell in CELLS:
    mdf = overlap_dataframe[overlap_dataframe['cell'] == cell]\
    [['peak_caller', 'q', 'peak_count', 'dhs_by_peak', 'peak_by_dhs']].copy()
    # Tracks without overlap data (e.g. marked as failed) have nothing to plot
    if mdf.empty:
        continue
    # DHSs without peaks, peaks without DHS and peaks overlapping DHS, in absolute numbers
    mdf['dhs_ex'] = (1 - mdf['dhs_by_peak'] / 100) * dhs_total
    mdf['peaks_ex'] = (1 - mdf['peak_by_dhs'] / 100) * mdf['peak_count']
    mdf['both'] = mdf['peak_by_dhs'] * mdf['peak_count'] / 100

    axs = {}

    # One grid column per (q, peak caller) combination
    mdf['qp'] = mdf['q'].astype(str) + " " + mdf['peak_caller']
    qpl = len(set(mdf['qp']))
    fig = plt.figure(figsize=(qpl, 4))
    offset = 0
    for q in QS:
        data = mdf.loc[mdf['q'] == float(q)]
        # A zero-width subplot cannot be created
        if data.empty:
            continue
        # Peak callers in order of appearance, without duplicates
        xlabels = list(dict.fromkeys(data['peak_caller']))
        w = len(xlabels)
        ax = plt.subplot2grid((1, qpl), (0, offset), colspan=w)
        # Peaks are stacked upwards, DHSs not covered by peaks go below zero
        ax.bar(data['peak_caller'], data['both'])
        ax.bar(data['peak_caller'], data['peaks_ex'],
               bottom=data['both'])
        ax.bar(data['peak_caller'], data['dhs_ex'],
               bottom=-data['dhs_ex'])

        axs[ax] = ax.get_ylim()
        if offset > 0:
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel('peaks')

        offset += w
        ax.set_xlabel('')
        ax.set_title(cell + ' ' + q)
        plt.xticks(range(0, len(xlabels)), xlabels, rotation=90)

    # Use the same y range in all the subplots of the track
    if axs:
        ymin = np.min([v[0] for v in axs.values()])
        ymax = np.max([v[1] for v in axs.values()])
        for ax in axs.keys():
            ax.set_ylim(bottom = ymin, top = ymax)
    plt.tight_layout()
    plt.show()