# 2021 - ENCODE GSE26320 peaks coverage.ipynb


See logbook: https://docs.google.com/document/d/1hjkhjo6cB4_AykuQVSDTU07IPD0t6988ZfXwmBb8q0A/edit#

0. Process peaks

```
# Run snakemake from the pipeline checkout; --directory below points it at the dataset folder.
cd /mnt/stripe/shpynov/chipseq-smk-pipeline

# MACS2 narrow
# One pipeline run per FDR threshold. The same $FDR is passed to MACS2 (-q), SPAN (span_fdr)
# and SICER (sicer_fdr); MACS2 results are tagged with the suffix "q$FDR".
for FDR in 0.1 0.05 0.01 1e-3 1e-4 1e-6 1e-8 1e-10; do
    snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2020_GSE26320/ \
    --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2020_GSE26320/fastq genome=hg19 \
    macs2_mode=narrow macs2_params="-q $FDR" macs2_suffix="q$FDR" \
    span_fdr=$FDR sicer_fdr=$FDR --rerun-incomplete;
done;

# MACS2 broad
# Same sweep in broad mode; $FDR is used as --broad-cutoff and results are tagged "broad$FDR".
for FDR in 0.1 0.05 0.01 1e-3 1e-4 1e-6 1e-8 1e-10; do
    snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2020_GSE26320/ \
    --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2020_GSE26320/fastq genome=hg19 \
    macs2_mode=broad macs2_params="--broad --broad-cutoff $FDR" macs2_suffix="broad$FDR" \
    span_fdr=$FDR sicer_fdr=$FDR --rerun-incomplete;
done;

```

1. Estimate the fragment size as the mean of the MACS2 `d` estimates over all `q0.05` runs; the result is `241.381`

```
# Literal TAB used as the column separator of fragment.tsv.
T=$'\t'
# Collect the "# d = ..." value reported in every MACS2 q0.05 .xls file.
# NOTE: fragment.tsv is appended to (>>), so remove it before re-running this snippet.
for F in macs2/*q0.05*.xls; do 
    NAME=$(basename $F | sed 's/_q.*//g'); D=$(cat $F | grep '# d =' | sed 's/.*= //g'); 
    echo "$NAME$T$D" >> fragment.tsv; 
done
# Mean of the second column (d) over all collected files.
cat fragment.tsv | awk '{D+=$2; N+=1} END {print(D/N)}'
```

2. Compute tags for coverage using `SHIFT = 125` (about half of the fragment size estimated above)

```
FOLDER=/mnt/stripe/shpynov/2021_GSE26320
cd $FOLDER

# Create BAM tags to compute coverage
cd bam
SHIFT=125
for BAM in *.bam; do
   echo ${BAM};
   # Each read becomes a 1bp tag shifted by SHIFT: reads whose strand column ($6) is not "-"
   # give [start+S, start+S+1); "-" reads give [end-S, end-S+1) and are dropped when end-S < 1.
   # `sort -u` keeps a single copy of identical tags, ordered by chromosome, end, start.
   bedtools bamtobed -i ${BAM} |\
      awk -v OFS='\t' -v S=${SHIFT} \
      '{if ($6 != "-") {print($1, $2+S, $2+S+1)} else {if ($3-S>=1) {print($1, $3-S, $3-S+1)}}}' |\
      sort -u -k1,1 -k3,3n -k2,2n > ${BAM/.bam/.tags}
done
# Move the generated .tags files next to the bam folder.
mkdir ../tags
mv *.tags ../tags

```
3. Compute coverage in peaks

```
# Compute bed4 files
# Keep columns 1, 2, 3 and 5 of every *0.05* peak file, sorted by chromosome and start.
cd ../peaks
for F in *0.05*.peak; do
    echo $F;
    cat $F | awk -v OFS='\t' '{print $1,$2,$3,$5}' | sort -k1,1 -k2,2n > $F.bed4;
done;
mkdir ../bed4
mv *.bed4 ../bed4/

# Add coverage information to peaks
mkdir ../covs
cd ../bed4
# Cell names: strip the leading "GSM<digits>_" and everything after the next underscore.
CELLS=$(ls *.bed4 | sed -E 's/GSM[0-9]+_//g' | sed 's/_.*//g' | sort --unique);
for C in ${CELLS[@]}; do
   echo $C;
   for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do
       for R in rep1 rep2; do
          echo $R;
          for F in *${C}_${M}_${R}*.bed4; do
             echo $F;
             echo $(ls ../tags/*${C}_${M}_${R}*.tags);
             echo $(ls ../tags/*${C}_Input_${R}*.tags);
             # First pass appends the number of signal tags per peak (column 5, temporary *.bed4t file);
             # second pass appends the number of Input tags (column 6), giving the final *.bed4tc file.
             bedtools intersect -a $F -b ../tags/*${C}_${M}_${R}*.tags -wa -c > ../covs/${F}t; 
             bedtools intersect -a ../covs/${F}t -b ../tags/*${C}_Input_${R}*.tags -wa -c > ../covs/${F}tc; 
             rm ../covs/${F}t; 
          done
       done
   done
done
```

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile
from itertools import product
import plotly.graph_objects as go
import plotly.express as px

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a tab-separated BED-like file.

    The file is read without a header; column 1 is the start and column 2 the end.
    Returns a pandas Series of lengths, or an empty numpy array when the file is
    empty or cannot be read/parsed (best effort).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[2] - tf[1]
    except Exception:
        # Empty file (pandas raises EmptyDataError) or otherwise unreadable table.
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
        return np.zeros(0)

def lines(file):
    """Return the number of records in a tab-separated file read without a header.

    Returns 0 when the file is empty or cannot be read/parsed (best effort).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return len(tf)
    except Exception:
        # Empty file (pandas raises EmptyDataError) or otherwise unreadable table.
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
        return 0

def join(file):
    """Return all intervals of a BED-like file as one 'chr:start-end,chr:start-end,...' string.

    The file is read tab-separated without a header (columns 0, 1, 2 are used).
    Returns an empty string when the file is empty or cannot be read/parsed (best effort).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return ','.join(f'{c}:{s}-{e}' for c, s, e in zip(tf[0], tf[1], tf[2]))
    except Exception:
        # Empty file (pandas raises EmptyDataError) or otherwise unreadable table.
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
        return ''

def rank_correlation(file):
    """Return the Spearman rank correlation between columns 3 and 7 of a tab-separated file.

    The file is read without a header. Returns 0 when the file is empty or
    cannot be read/parsed (best effort).
    """
    # Imported here because the notebook only does `import scipy as sp`; previously the
    # missing name raised NameError inside the try block and the function always returned 0.
    from scipy.stats import spearmanr
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return spearmanr(tf[3], tf[7])[0]
    except Exception:
        # Empty file (pandas raises EmptyDataError) or a table without the expected columns.
        return 0
    
def d(a, b):
    """Safe division: return a / b, or 0 when the denominator is zero."""
    if b != 0:
        return a / b
    return 0


# Peaks analysis

In [None]:
# Root folder of the coverage analysis (tags, covs, bed, figures).
PATH = '/mnt/stripe/shpynov/2021_GSE26320'

# Cell lines with a complete set of libraries.
# Ignored: H1 (no H3K27ac rep2), Huvec (no H3K4me3 rep1), HepG2 (no H3K4me1 rep2).
CELLS = [
    'GM12878',
    'HMEC',
    'HSMM',
    'K562',
    'NHEK',
    'NHLF',
]

# Histone modifications and replicates to analyse.
MODIFICATIONS = [
    'H3K27ac',
    'H3K27me3',
    'H3K36me3',
    'H3K4me1',
    'H3K4me3',
]
REPS = ['rep1', 'rep2']

In [None]:
def load_peaks_fdr(path, suffix, fdrs):
    """Summarise peak files of one peak caller across FDR thresholds.

    Every file in `path` whose name contains `suffix` is annotated by substring
    matching: the first entry of `fdrs`, CELLS and MODIFICATIONS found in the name,
    and 'rep1' if present in the name (otherwise 'rep2'). Files without an FDR,
    cell or modification match are skipped.

    Returns a DataFrame with columns file, modification, cell, replicate, fdr,
    peaks (number of records) and avlength (mean peak length, 0 for empty files).
    """
    def first_match(candidates, filename):
        # First candidate that occurs as a substring of the file name, else None.
        for candidate in candidates:
            if candidate in filename:
                return candidate
        return None

    summary = pd.DataFrame(
        columns=['file', 'modification', 'cell', 'replicate', 'fdr', 'peaks', 'avlength'],
        dtype=object)
    for filename in tqdm(os.listdir(path)):
        if suffix not in filename:
            continue
        fdr = first_match(fdrs, filename)
        cell = first_match(CELLS, filename)
        mod = first_match(MODIFICATIONS, filename)
        rep = 'rep2' if 'rep1' not in filename else 'rep1'
        if not (fdr and cell and rep and mod):
            continue
        full_path = os.path.join(path, filename)
        n_peaks = lines(full_path)
        lengths = bedl(full_path)
        average = sum(lengths) / n_peaks if n_peaks != 0 else 0
        summary.loc[len(summary)] = (filename, mod, cell, rep, fdr, n_peaks, average)
    return summary

In [None]:
def plotdf(df, tool, title):
    """Plot column `title` of `df` against FDR, one bar-plot panel per modification.

    `df` is expected to have 'modification' and 'fdr' columns plus the column named
    by `title`. The figure is saved to {PATH}/figures/fdr_{tool}_{title}.png and shown.
    """
    panels = len(MODIFICATIONS)
    fig = plt.figure(figsize=(3 * panels, 4))
    for position, mod in enumerate(MODIFICATIONS, start=1):
        subset = df.loc[df['modification'] == mod].copy()
        # FDR labels are strings ('0.05', '1e-06', ...); convert to numbers for the x axis.
        subset['fdr'] = subset['fdr'].astype(float)

        ax = plt.subplot(1, panels, position)
        ax.title.set_text(mod)
        # Alternative view: sns.boxplot(data=subset, x="fdr", y=title, ax=ax)
        sns.barplot(data=subset, x="fdr", y=title, ax=ax,
                    capsize=.2, errwidth=2, edgecolor="black")
        ax.xaxis.set_tick_params(rotation=90)
        ax.set_ylabel(title)

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/fdr_{tool}_{title}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# Output folder for all saved figures. exist_ok makes the cell safe to re-run
# (plain `mkdir` reports an error when the folder already exists).
os.makedirs(f'{PATH}/figures', exist_ok=True)

### SPAN

In [None]:
# FDR labels as they appear in SPAN island file names.
SPAN_FDRS = [
    '0.1', '0.01', '0.05', '0.001',
    '0.0001', '1e-06', '1e-08', '1e-10',
]
df_fdr_span = load_peaks_fdr(
    path='/mnt/stripe/shpynov/2020_GSE26320/span-islands',
    suffix='.islands',
    fdrs=SPAN_FDRS,
)

In [None]:
# Number of peaks and average peak length per FDR for SPAN.
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_span, 'span', column)

In [None]:
# Troubleshooting: H3K27me3 tracks at FDR 0.05 with suspiciously few (< 5000) peaks
is_k27me3 = df_fdr_span['modification'] == 'H3K27me3'
is_fdr_005 = df_fdr_span['fdr'] == '0.05'
has_few_peaks = df_fdr_span['peaks'] < 5000
display(df_fdr_span[is_k27me3 & is_fdr_005 & has_few_peaks])

In [None]:
print('SPAN old peaks')
# Same summary for the SPAN *.peak files.
df_fdr_span_old = load_peaks_fdr(
    path='/mnt/stripe/shpynov/2020_GSE26320/span',
    suffix='.peak',
    fdrs=SPAN_FDRS,
)

In [None]:
# Number of peaks and average peak length per FDR for the old SPAN peaks.
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_span_old, 'span_old', column)

### MACS narrow

In [None]:
# FDR labels as they appear in MACS2 narrow file names: the pipeline was run with
# macs2_suffix="q$FDR" for FDR in 0.1 0.05 0.01 1e-3 1e-4 1e-6 1e-8 1e-10.
# '1e-4' was previously misspelled '1-e4', so the q1e-4 files were silently skipped.
MACS2_FDRS = ['0.1', '0.01', '0.05', '1e-3', '1e-4', '1e-6', '1e-8', '1e-10']
df_fdr_macs2 = load_peaks_fdr('/mnt/stripe/shpynov/2020_GSE26320/macs2', '.narrowPeak', MACS2_FDRS)

In [None]:
# Number of peaks and average peak length per FDR for MACS2 narrow.
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_macs2, 'macs2', column)

### MACS2 broad

In [None]:
# FDR labels as they appear in MACS2 broad file names: the pipeline was run with
# macs2_suffix="broad$FDR" for FDR in 0.1 0.05 0.01 1e-3 ..., i.e. the 1% threshold is
# spelled '0.01' in file names. '1e-2' is kept as well in case some files use that spelling;
# both labels become 0.01 once plotdf converts them to float.
MACS2BROAD_FDRS = ['0.1', '0.05', '0.01', '1e-2', '1e-3', '1e-4', '1e-6', '1e-8', '1e-10']
df_fdr_macs2broad = load_peaks_fdr('/mnt/stripe/shpynov/2020_GSE26320/macs2', '.broadPeak', MACS2BROAD_FDRS)

In [None]:
# Number of peaks and average peak length per FDR for MACS2 broad.
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_macs2broad, 'macs2broad', column)

### SICER

In [None]:
# FDR labels as they appear in SICER summary file names.
SICER_FDRS = [
    '0.1', '0.05', '0.01', '0.001',
    '0.0001', '1e-06', '1e-08', '1e-10',
]
df_fdr_sicer = load_peaks_fdr(
    path='/mnt/stripe/shpynov/2020_GSE26320/sicer',
    suffix='summary-FDR',
    fdrs=SICER_FDRS,
)

In [None]:
# Number of peaks and average peak length per FDR for SICER.
for heading, column in (('Peaks', 'peaks'), ('Average length', 'avlength')):
    print(heading)
    plotdf(df_fdr_sicer, 'sicer', column)

# Coverage analysis

In [None]:
# Annotate every peaks-with-coverage file in {PATH}/covs by substring matching on its name.
FILES = os.listdir(os.path.join(PATH, 'covs'))
# display(FILES)

reps, tools, cells, mods, files = [], [], [], [], []

for f in FILES:
    # Replicate: 'rep1_rep2' marks ENCODE replicated peaks, otherwise rep1 / rep2.
    if 'rep1_rep2' in f:
        rep = 'rep1_rep2'
    elif 'rep1' in f:
        rep = 'rep1'
    else:
        rep = 'rep2'

    # Tool: the first marker found in the file name wins, so the order matters.
    tool = None
    for marker, label in (('rep1_rep2', 'ENCODE rep'),
                          ('ENCODE', 'ENCODE'),
                          ('5.islands', 'SPAN'),
                          ('FDR', 'SICER'),
                          ('narrowPeak', 'MACS2'),
                          ('broadPeak', 'MACS2 broad')):
        if marker in f:
            tool = label
            break

    cell = next((cc for cc in CELLS if cc in f), None)
    mod = next((m for m in MODIFICATIONS if m in f), None)

    # Skip files that could not be fully annotated.
    if not (rep and tool and cell and mod):
        continue
    reps.append(rep)
    tools.append(tool)
    cells.append(cell)
    mods.append(mod)
    files.append(f'{PATH}/covs/{f}')

df = pd.DataFrame({'cell': cells, 'modification': mods, 'rep': reps, 'file': files, 'tool': tools})
df

In [None]:
# Per-file statistics: number of peaks, total length and average peak length.
peaks, lengths, avlengths = [], [], []

for path in df['file']:
    count = lines(path)
    sizes = bedl(path)
    if count == 0:
        average = 0
    else:
        average = sum(sizes) / count
    peaks.append(count)
    lengths.append(sum(sizes))
    avlengths.append(average)

df['peaks'] = peaks
df['length'] = lengths
df['average length'] = avlengths
df

In [None]:
def _library_tags(cell, target, replicate, tag_files):
    """Count the tags of the library whose file name contains '{cell}_{target}_{replicate}'.

    `tag_files` is the listing of {PATH}/tags/. Raises FileNotFoundError with the
    missing key when no tags file matches (previously an opaque `str + None` TypeError).
    """
    key = f'{cell}_{target}_{replicate}'
    name = next((f for f in tag_files if key in f), None)
    if name is None:
        raise FileNotFoundError(f'No tags file matching {key} in {PATH}/tags/')
    return lines(f'{PATH}/tags/' + name)


# Library sizes (number of tags) for every Input and modification track.
coverage_df = pd.DataFrame(columns=['cell', 'replicate', 'modification', 'tags'], dtype=object)
# List the folder once instead of once per library.
_tag_files = os.listdir(f'{PATH}/tags/')
for c, r in tqdm(product(CELLS, REPS), total=len(CELLS) * len(REPS)):
    coverage_df.loc[len(coverage_df)] = (c, r, 'Input', _library_tags(c, 'Input', r, _tag_files))
    for m in MODIFICATIONS:
        coverage_df.loc[len(coverage_df)] = (c, r, m, _library_tags(c, m, r, _tag_files))

        
def libcoveraget(m, c, r):
    """Return the number of tags of the library for modification `m`, cell `c`, replicate `r`.

    Looks the value up in the module-level `coverage_df`; raises IndexError when
    there is no such library.
    """
    same_modification = coverage_df['modification'] == m
    same_cell = coverage_df['cell'] == c
    same_replicate = coverage_df['replicate'] == r
    selected = coverage_df[same_modification & same_cell & same_replicate]
    return selected['tags'].values[0]
def libcoveragec(c, r):
    """Return the number of tags of the Input (control) library for cell `c`, replicate `r`."""
    control_tags = libcoveraget(m='Input', c=c, r=r)
    return control_tags

In [None]:
print('Libraries tags x10mln')
# One panel per modification plus one for Input; a bar per cell line and replicate.
targets = MODIFICATIONS + ['Input']
fig = plt.figure(figsize=(3 * len(targets), 4))
for position, target in enumerate(targets, start=1):
    panel = coverage_df.loc[coverage_df['modification'] == target].copy()
    panel['name'] = panel['cell'] + ' ' + panel['replicate']
    ax = plt.subplot(1, len(targets), position)
    ax.title.set_text(target)
    sns.barplot(data=panel, x='name', y='tags', ax=ax, color='blue')
    # Same y range on every panel so library sizes are comparable.
    ax.set_ylim(bottom=0, top=1.8e7)
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('Tags')

plt.tight_layout()
plt.savefig(f'{PATH}/figures/libraries.png', bbox_inches='tight', dpi=300)
plt.show()

### Loading detailed peaks information

In [None]:
# Folder for the intermediate overlap/difference BED files. exist_ok makes the cell
# safe to re-run (plain `mkdir` reports an error when the folder already exists).
os.makedirs(f'{PATH}/bed', exist_ok=True)
def coverage_t(file):
    """Return the signal (treatment) tag count of every peak: column 4 of the file.

    The file is read tab-separated without a header. Returns a pandas Series, or an
    empty list when the file is empty or cannot be read/parsed (best effort).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[4]
    except Exception:
        # Empty file (pandas raises EmptyDataError) or a table without column 4.
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
        return []

def coverage_c(file):
    """Return the control (Input) tag count of every peak: column 5 of the file.

    The file is read tab-separated without a header. Returns a pandas Series, or an
    empty list when the file is empty or cannot be read/parsed (best effort).
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return tf[5]
    except Exception:
        # Empty file (pandas raises EmptyDataError) or a table without column 5.
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
        return []

def coverage_a(file, cell, replicate, modification=None):
    """Return the control-subtracted ("aggregated") coverage of every peak in `file`.

    For each peak: signal tags (column 4) minus control tags (column 5) scaled by
    min(1, signal library size / control library size), clipped at 0.

    Parameters:
        file: tab-separated peaks file with signal and control counts in columns 4 and 5.
        cell, replicate: identify the signal and Input libraries.
        modification: histone modification of the signal library. When omitted it is
            taken from the file name (first entry of MODIFICATIONS found in it).

    Returns a pandas Series, or an empty list when the file is empty or cannot be read.
    Raises ValueError when the modification is neither given nor present in the file name.

    The previous version called libcoveraget(cell, replicate) without the modification;
    the resulting TypeError was swallowed and the function always returned [].
    """
    if modification is None:
        name = os.path.basename(str(file))
        modification = next((m for m in MODIFICATIONS if m in name), None)
    if modification is None:
        raise ValueError(f'Cannot infer modification from file name: {file}')

    signal_tags = libcoveraget(modification, cell, replicate)
    control_tags = libcoveragec(cell, replicate)
    # An empty control library cannot be scaled down; cap the factor at 1 as for large ratios.
    scale = min(1, signal_tags / control_tags) if control_tags else 1
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
        return (tf[4] - tf[5] * scale).clip(lower=0)
    except Exception:
        # Empty file (pandas raises EmptyDataError) or a table without the count columns.
        return []

def zip6(v1, v2, v3, v4, v5, arr):
    """Return one 6-tuple (v1, v2, v3, v4, v5, item) for every item of `arr`."""
    prefix = (v1, v2, v3, v4, v5)
    rows = []
    for item in arr:
        rows.append(prefix + (item,))
    return rows


df_comparison = pd.DataFrame(columns=[
    'modification', 'cell', 'replicate', 'tool1', 'tool2', 
    'peaks1', 'peaks2', 'length1', 'length2',
    'overlap1', 'overlap2', 'overlap1l', 'overlap2l', 
    'diff1', 'diff2', 'diff1l', 'diff2l',
    'cov1t', 'cov1c', 'cov1a', 
    'cov2t', 'cov2c', 'cov2a',
    'overlap1t', 'overlap1c', 'overlap1a',
    'overlap2t', 'overlap2c', 'overlap2a',                                      
    'diff1t', 'diff1c', 'diff1a',
    'diff2t', 'diff2c', 'diff2a'], dtype=object)

tools = list(sorted(set(df['tool'])))

df_peaks = []
for m, c, r in product(MODIFICATIONS, CELLS, REPS):
    print(m, c, r)

    covtm = libcoveraget(m, c, r) * 1e-6
    covcm = libcoveragec(c, r) * 1e-6

    for i in tqdm(range(len(tools))):
        tool1 = tools[i]
        t1 = df.loc[(df['cell'] == c) & (df['modification'] == m) & 
                    (df['rep'].str.contains(r)) & (df['tool']==tool1)]

        file1 = t1['file'].values[0]
#         print('Tool1', tool1, file1)

        peaks1, length1 = lines(file1), bedl(file1)
        df_peaks.extend(zip6(m, c, r, f'{tool1} ({peaks1})', 'length', length1))            
        cov1t, cov1c, cov1a = coverage_t(file1), coverage_c(file1), coverage_a(file1, c, r)
        df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'signal', 
                       [co / covtm for co in cov1t]))
        df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'control', 
                       [co / covcm for co in cov1c]))
#                 df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'aggregated', cov1a))
        df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'signal pbp', 
                       [d(co / covtm, l * 1e-3) for co, l in zip(cov1t, length1)]))
        df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'control pbp', 
                       [d(co / covcm, l * 1e-3) for co, l in zip(cov1c, length1)]))
#                 df_peaks.extend(zip6(m, c, r, f'coverage {tool1} ({peaks1})', 'aggregated pbp', [d(co, l) for co, l in zip(cov1a, length1)]))

            
        
        for j in range(i + 1, len(tools)):
            tool2 = tools[j]
            t2 = df.loc[(df['cell'] == c) & (df['modification'] == m) & 
                        (df['rep'].str.contains(r)) & (df['tool']==tool2)]

            file2 = t2['file'].values[0]
#             print('Tool2', tool2, file2)

            peaks2, length2 = lines(file2), bedl(file2)
            df_peaks.extend(zip6(m, c, r, f'{tool2} ({peaks2})', 'length', length2))
            cov2t, cov2c, cov2a = coverage_t(file2), coverage_c(file2), coverage_a(file2, c, r)
            df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'signal', 
                           [co / covtm for co in cov2t]))
            df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'control', 
                           [co / covcm for co in cov1c]))
#                 df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'aggregated', cov2a))
            df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'signal pbp', 
                           [d(co / covtm, l * 1e-3) for co, l in zip(cov2t, length2)]))
            df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'control pbp', 
                           [d(co / covcm, l * 1e-3) for co, l in zip(cov2c, length2)]))
#                 df_peaks.extend(zip6(m, c, r, f'coverage {tool2} ({peaks2})', 'aggregated pbp', [d(co, l) for co, l in zip(cov2a, length2)]))


            tf = f'{PATH}/bed/{m}_{c}_{r}_overlap_{tool1}_vs_{tool2}.bed'.replace(' ', '_')
            !bedtools intersect -a {file1} -b {file2} -wa -u > {tf}
            overlap1, overlap1l = lines(tf), bedl(tf)
            df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'length', overlap1l))            
            overlap1t, overlap1c, overlap1a = coverage_t(tf), coverage_c(tf), coverage_a(tf, c, r)
            df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'signal', 
                           [co / covtm for co in overlap1t]))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'control', 
                           [co / covcm for co in overlap1c]))
#                 df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'aggregated', overlap1a))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'signal pbp', 
                           [d(co / covtm, l * 1e-3) for co, l in zip(overlap1t, overlap1l)]))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'control pbp', 
                           [d(co / covcm, l * 1e-3) for co, l in zip(overlap1c, overlap1l)]))
#                 df_peaks.extend(zip6(m, c, r, f'overlap {tool1} vs {tool2} ({overlap1})', 'aggregated pbp', [d(co, l) for co, l in zip(overlap1a, overlap1l)]))


            tf = f'{PATH}/bed/{m}_{c}_{r}_overlap_{tool2}_vs_{tool1}.bed'.replace(' ', '_')
            !bedtools intersect -b {file1} -a {file2} -wa -u > {tf}
            overlap2, overlap2l = lines(tf), bedl(tf)
            df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'length', overlap2l))                        
            overlap2t, overlap2c, overlap2a = coverage_t(tf), coverage_c(tf), coverage_a(tf, c, r)
            df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'signal', 
                           [co / covtm for co in overlap2t]))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'control', 
                           [co / covcm for co in overlap2c]))
#                 df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'aggregated', overlap2a))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'signal pbp', 
                           [d(co / covtm, l * 1e-3) for co, l in zip(overlap2t, overlap2l)]))
            df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'control pbp', 
                           [d(co / covcm, l * 1e-3) for co, l in zip(overlap2c, overlap2l)]))
#                 df_peaks.extend(zip6(m, c, r, f'overlap {tool2} vs {tool1} ({overlap2})', 'aggregated pbp', [d(co, l) for co, l in zip(overlap2a, overlap2l)]))


            tf = f'{PATH}/bed/{m}_{c}_{r}_diff_{tool1}_vs_{tool2}.bed'.replace(' ', '_')
            !bedtools intersect -a {file1} -b {file2} -v > {tf}
            diff1, diff1l = lines(tf), bedl(tf)
            df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'length', diff1l))                        
            diff1t, diff1c, diff1a = coverage_t(tf), coverage_c(tf), coverage_a(tf, c, r)
            df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'signal', 
                           [co / covtm for co in diff1t]))
            df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'control', 
                           [co / covcm for co in diff1c]))
#                 df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'aggregated', diff1a))
            df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'signal pbp', 
                           [d(co / covtm, l * 1e-3) for co, l in zip(diff1t, diff1l)]))
            df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'control pbp', 
                           [d(co / covcm, l * 1e-3) for co, l in zip(diff1c, diff1l)]))
#                 df_peaks.extend(zip6(m, c, r, f'diff {tool1} - {tool2} ({diff1})', 'aggregated pbp', [d(co, l) for co, l in zip(diff1a, diff1l)]))

            tf = f'{PATH}/bed/{m}_{c}_{r}_diff_{tool2}_vs_{tool1}.bed'.replace(' ', '_')        
            !bedtools intersect -b {file1} -a {file2} -v > {tf}
            diff2, diff2l = lines(tf), bedl(tf)
            df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'length', diff2l))            
            diff2t, diff2c, diff2a = coverage_t(tf), coverage_c(tf), coverage_a(tf, c, r)
            df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'signal', 
                           [co / covtm for co in diff2t]))
            df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'control', 
                           [co / covcm for co in diff2c]))
#                 df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'aggregated', diff2a))
            df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'signal pbp', 
                           [d(co / covtm, l * 1e-3) for co, l in zip(diff2t, diff2l)]))
            df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'control pbp', 
                           [d(co / covcm, l * 1e-3) for co, l in zip(diff2c, diff2l)]))
#                 df_peaks.extend(zip6(m, c, r, f'diff {tool2} - {tool1} ({diff2})', 'aggregated pbp', [d(co, l) for co, l in zip(diff2a, diff2l)]))

            df_comparison.loc[len(df_comparison)] = (
                m, c, r, tool1, tool2, 
                peaks1, peaks2, sum(length1), sum(length2), 
                overlap1, overlap2, sum(overlap1l), sum(overlap2l),
                diff1, diff2, sum(diff1l), sum(diff2l),
                sum(cov1t), sum(cov1c), sum(cov1a), 
                sum(cov2t), sum(cov2c), sum(cov2a), 
                sum(overlap1t), sum(overlap1c), sum(overlap1a),
                sum(overlap2t), sum(overlap2c), sum(overlap2a),
                sum(diff1t), sum(diff1c), sum(diff1a),
                sum(diff2t), sum(diff2c), sum(diff2a)
            )
            
            
df_peaks = pd.DataFrame(df_peaks, columns=['modification', 'cell', 'replicate', 'name', 'type', 'value'])

In [None]:
# Preview of the per-peak table.
display(df_peaks.head(n=5))

In [None]:
# Preview the pairwise comparison table and persist it next to the data.
display(df_comparison.head(n=5))
df_comparison.to_csv(path_or_buf=f'{PATH}/comparison.csv', index=False)

In [None]:
print('Agggregate peaks, length, coverage and density information')
# Two rows ('signal' and 'control') per df_comparison row with, for both tools, their overlaps
# and their differences: counts, average length, average coverage per peak and coverage per bp.
df_comparison_average = pd.DataFrame(columns=['m', 'c', 'r', 'coverage', 'tool1', 'tool2', 
      'peaks1', 'peaks2', 'overlap1', 'overlap2', 'diff1', 'diff2',
      'avl1', 'avl2', 'ov_avl1', 'ov_avl2', 'diff_avl1', 'diff_avl2', 
      'avcov1', 'avcov2', 'ov_avcov1', 'ov_avcov2', 'diff_avcov1', 'diff_avcov2', 
      'covpbp1', 'covpbp2', 'ov_covpbp1', 'ov_covpbp2', 'diff_covpbp1', 'diff_covpbp2'], dtype=object)

for _, row in tqdm(df_comparison.iterrows()):
    # Peak counts and total lengths, in the order: tool1, tool2, overlap1, overlap2, diff1, diff2.
    counts = [row[key] for key in ('peaks1', 'peaks2', 'overlap1', 'overlap2', 'diff1', 'diff2')]
    sizes = [row[key] for key in ('length1', 'length2', 'overlap1l', 'overlap2l', 'diff1l', 'diff2l')]
    average_lengths = [d(size, count) for size, count in zip(sizes, counts)]

    # Column suffix 't' holds signal coverage, 'c' control coverage.
    # Aggregated coverage (suffix 'a') is disabled; add ('aggregated', 'a') below to enable it.
    for kind, suffix in (('signal', 't'), ('control', 'c')):
        covs = [row[f'{key}{suffix}'] for key in ('cov1', 'cov2', 'overlap1', 'overlap2', 'diff1', 'diff2')]
        df_comparison_average.loc[len(df_comparison_average)] = (
            row['modification'], row['cell'], row['replicate'], kind, row['tool1'], row['tool2'],
            *counts,
            *average_lengths,
            *[d(cov, count) for cov, count in zip(covs, counts)],
            *[d(cov, size) for cov, size in zip(covs, sizes)],
        )

In [None]:
# Show all columns of the wide summary table, preview it and persist it next to the data.
pd.set_option("display.max_columns", None)
display(df_comparison_average.head(n=5))
df_comparison_average.to_csv(path_or_buf=f'{PATH}/comparison2.csv', index=False)

In [None]:
# df_comparison_average = pd.read_csv(f'{PATH}/comparison2.csv')
# df_comparison_average

## Aggregated for all tools

In [None]:
# Panels of the per-modification summary: columns of df_comparison_average holding
# the values for tool1, tool2, (tool1 - tool2) and (tool2 - tool1).
COLS = {
    'peaks': ['peaks1', 'peaks2', 'diff1', 'diff2'],
    'length': ['avl1', 'avl2', 'diff_avl1', 'diff_avl2'],
    # 'coverage': ['avcov1', 'avcov2', 'diff_avcov1', 'diff_avcov2'],
    'density': ['covpbp1', 'covpbp2', 'diff_covpbp1', 'diff_covpbp2'],
}

## Per modification

In [None]:
def plot_tools_summary_modification(df_comparison, m):
    """Plot a tools summary for modification `m`: one box-plot panel per entry of COLS.

    Parameters:
        df_comparison: averages table with columns 'm', 'c', 'r', 'coverage', 'tool1',
            'tool2' and the value columns listed in COLS.
        m: modification to plot.

    Every panel shows, for each tool and each pairwise difference, the distribution of
    the value over cells and replicates; the mean is appended to the box label.
    'coverage' and 'density' panels show signal and control side by side.
    The figure is saved to {PATH}/figures/{m}.png and shown.
    """
    plt.figure(figsize=(5 * len(COLS), 6))

    for k, (name, columns) in enumerate(COLS.items()):
        with_control = name == 'coverage' or name == 'density'
        types = ['signal', 'control'] if with_control else ['signal']
        tts = []
        # A tool takes part in several pairs; keep its values only the first time they are seen.
        seen_data = {tt: set() for tt in types}
        for i in range(len(tools)):
            for j in range(i + 1, len(tools)):
                tool1 = tools[i]
                tool2 = tools[j]
                labels = [tool1, tool2, f'diff {tool1} - {tool2}', f'diff {tool2} - {tool1}']

                for tt in types:
                    ts = df_comparison.loc[(df_comparison['m'] == m) &
                                           (df_comparison['tool1'] == tool1) &
                                           (df_comparison['tool2'] == tool2) &
                                           (df_comparison['coverage'] == tt)].copy()
                    ts['exp'] = [f'{c} {r}' for c, r in zip(ts['c'], ts['r'])]
                    t = ts[['exp'] + columns].rename(columns=dict(zip(columns, labels)))

                    # Melt the four value columns explicitly. The previous
                    # `list(t.columns).remove('exp')` evaluated to None (list.remove returns
                    # None) and only worked because melt then uses all non-id columns.
                    t = pd.melt(t, id_vars=['exp'], value_vars=labels)
                    t['type'] = tt
                    t = t.loc[~(t['variable'].isin(seen_data[tt]))].copy()
                    seen_data[tt].update(t['variable'])
                    tts.append(t)

        t = pd.concat(tts).sort_values(by=['variable'])
        # Append the mean value to every box label
        for var in set(t['variable']):
            tm = t[t["variable"] == var]["value"].mean()
            if name == 'peaks' or name == 'length':
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({int(tm)})'
            else:
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({tm:.3f})'

        # Plot
        ax = plt.subplot(1, len(COLS), k + 1)
        ax.title.set_text(f'{m} {name}')
        if with_control:
            sns.boxplot(data=t, x='variable', y='value', hue='type', ax=ax)
        else:
            sns.boxplot(data=t, x='variable', y='value', ax=ax)
        ax.xaxis.set_tick_params(rotation=90)
        ax.set_ylabel(name)

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/{m}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# One summary figure per histone modification.
for modification in MODIFICATIONS:
    print(modification)
    plot_tools_summary_modification(df_comparison_average, modification)

## Per type

In [None]:
# Panels of the per-type summary (now including average coverage): columns of
# df_comparison_average holding the values for tool1, tool2, (tool1 - tool2) and (tool2 - tool1).
COLS = {
    'peaks': ['peaks1', 'peaks2', 'diff1', 'diff2'],
    'length': ['avl1', 'avl2', 'diff_avl1', 'diff_avl2'],
    'coverage': ['avcov1', 'avcov2', 'diff_avcov1', 'diff_avcov2'],
    'density': ['covpbp1', 'covpbp2', 'diff_covpbp1', 'diff_covpbp2'],
}

In [None]:
def plot_tools_summary_type(df_comparison, title, name):
    """Plot one kind of statistic for all tools: one box-plot panel per modification.

    Parameters:
        df_comparison: averages table with columns 'm', 'c', 'r', 'coverage', 'tool1',
            'tool2' and the value columns listed in COLS[name].
        title: y-axis label of every panel.
        name: key of COLS; its columns must hold the values for tool1, tool2,
            (tool1 - tool2) and (tool2 - tool1), in this order.

    Every panel shows, for each tool and each pairwise difference, the distribution of
    the value over cells and replicates; the mean is appended to the box label.
    'coverage' and 'density' show signal and control side by side.
    The figure is saved to {PATH}/figures/{name}.png and shown.
    """
    columns = COLS[name]
    with_control = name == 'coverage' or name == 'density'
    types = ['signal', 'control'] if with_control else ['signal']
    plt.figure(figsize=(5 * len(MODIFICATIONS), 6))

    for k, m in enumerate(MODIFICATIONS):

        tts = []
        # A tool takes part in several pairs; keep its values only the first time they are seen.
        seen_data = {tt: set() for tt in types}
        for i in range(len(tools)):
            for j in range(i + 1, len(tools)):
                tool1 = tools[i]
                tool2 = tools[j]
                labels = [tool1, tool2, f'diff {tool1} - {tool2}', f'diff {tool2} - {tool1}']

                for tt in types:
                    ts = df_comparison.loc[(df_comparison['m'] == m) &
                                           (df_comparison['tool1'] == tool1) &
                                           (df_comparison['tool2'] == tool2) &
                                           (df_comparison['coverage'] == tt)].copy()
                    ts['exp'] = [f'{c} {r}' for c, r in zip(ts['c'], ts['r'])]
                    t = ts[['exp'] + columns].rename(columns=dict(zip(columns, labels)))

                    # Melt the four value columns explicitly. The previous
                    # `list(t.columns).remove('exp')` evaluated to None (list.remove returns
                    # None) and only worked because melt then uses all non-id columns.
                    t = pd.melt(t, id_vars=['exp'], value_vars=labels)
                    t['type'] = tt
                    t = t.loc[~(t['variable'].isin(seen_data[tt]))].copy()
                    seen_data[tt].update(t['variable'])
                    tts.append(t)

        t = pd.concat(tts).sort_values(by=['variable'])
        # Append the mean value to every box label
        for var in set(t['variable']):
            tm = t[t["variable"] == var]["value"].mean()
            if name == 'peaks' or name == 'length':
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({int(tm)})'
            else:
                t.loc[t['variable'] == var, 'variable'] = f'{var} ({tm:.3f})'
        # Plot
        ax = plt.subplot(1, len(MODIFICATIONS), k + 1)
        ax.title.set_text(m)
        if with_control:
            sns.boxplot(data=t, x='variable', y='value', hue='type', ax=ax)
        else:
            sns.boxplot(data=t, x='variable', y='value', ax=ax)
        ax.xaxis.set_tick_params(rotation=90)
        ax.set_ylabel(title)

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/{name}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# One figure per statistic: (y-axis title, key of COLS).
for summary_title, summary_name in (('Peaks number', 'peaks'),
                                    ('Peaks length', 'length'),
                                    ('Coverage', 'coverage'),
                                    ('Density', 'density')):
    print(summary_title)
    plot_tools_summary_type(df_comparison_average, summary_title, summary_name)

### Pairwise peaks number venn diagrams

In [None]:
from matplotlib_venn import venn2, venn2_circles

# Number of Venn diagrams placed on one figure row.
VENN_COLS = 6

# For every modification draw one figure with a Venn diagram per
# (cell, replicate, tool pair) row of df_comparison, restricted to the pairs
# where one of the tools is SPAN. Each figure is saved to
# {PATH}/figures/venn_{m}.png.
for m in MODIFICATIONS:
    print(m)
    tm = df_comparison.loc[df_comparison['modification'] == m]
    tm = tm.sort_values(by=['cell', 'tool1', 'tool2', 'replicate'])
    # Only comparisons that involve SPAN are plotted.
    tm = tm.loc[(tm['tool1'] == 'SPAN') | (tm['tool2'] == 'SPAN')]

    # One row per cell line by default; grow the grid when there are more
    # diagrams than it can hold, otherwise plt.subplot raises on an
    # out-of-range index.
    n_rows = max(len(CELLS), -(-len(tm) // VENN_COLS))  # ceil division
    plt.figure(figsize=(20, 3 * n_rows))

    for k, (_, row) in enumerate(tqdm(tm.iterrows(), total=len(tm)), start=1):
        c, r, tool1, tool2 = row['cell'], row['replicate'], row['tool1'], row['tool2']
        peaks1, peaks2 = row['peaks1'], row['peaks2']
        overlap1, overlap2 = row['overlap1'], row['overlap2']

        # Tweak subsets to avoid empty areas: every region gets a minimal
        # drawn size relative to the larger peak set. The real numbers are
        # written into the labels below.
        largest = max(peaks1, peaks2)
        subsets = (max(0.15 * largest, peaks1 - overlap1),
                   max(0.15 * largest, peaks2 - overlap2),
                   max(0.3 * largest, overlap1, overlap2))

        # Plot
        ax = plt.subplot(n_rows, VENN_COLS, k)
        v = venn2(subsets=subsets, set_labels=(tool1, tool2), set_colors=('r', 'g'),
                  alpha=0.5, ax=ax)
        # Pass the axes explicitly instead of relying on the current axes.
        venn2_circles(subsets=subsets, color='grey', ax=ax)
        v.get_label_by_id('10').set_text(str(peaks1 - overlap1))
        v.get_label_by_id('01').set_text(str(peaks2 - overlap2))

        # Ratio of the two overlap counts, normalised so the smaller side is 1.
        if overlap1 != 0 and overlap2 != 0:
            if overlap1 > overlap2:
                prop = f'{overlap1/overlap2:.1f} : 1'
            else:
                prop = f'1 : {overlap2/overlap1:.1f}'
            prop = prop.replace('.0', '')  # Cosmetics
        else:
            prop = ''
        v.get_label_by_id('11').set_text(f'{overlap1} : {overlap2}\n{prop}')
        ax.title.set_text(f'{m} {c} {r}')

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/venn_{m}.png', bbox_inches='tight', dpi=300)
    plt.show()

# Detailed analysis

In [None]:
# Keep every row of df_peaks whose name does not start with 'overlap'.
is_overlap_row = df_peaks['name'].str.startswith('overlap')
df_no = df_peaks.loc[~is_overlap_row]
df_no.head()

### Detailed peaks information for each modification, cell line and replicate

In [None]:
# For every (modification, cell, replicate) combination present in df_no draw a
# three-panel figure of boxplots (length / coverage / density) and save it to
# {PATH}/figures/peaks_{m}_{c}_{r}.png. Combinations without rows are skipped.
for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    print(m, c, r)
    ts = df_no.loc[(df_no['cell'] == c) & (df_no['modification'] == m) & (df_no['replicate'] == r)]
    if len(ts) == 0:
        continue

    # Plot
    plt.figure(figsize=(15, 6))

    # Panel 1: 'length' rows. Values are capped at 20000 for display.
    # The cap is applied by assignment: a chained `t['value'].clip(inplace=True)`
    # does not modify `t` under pandas Copy-on-Write.
    t = (ts.loc[ts['type'] == 'length', ['name', 'value']]
         .sort_values(by=['name'])
         .assign(value=lambda d: d['value'].clip(upper=20000)))
    ax = plt.subplot(1, 3, 1)
    sns.boxplot(data=t, x='name', y='value', ax=ax)
    ax.title.set_text(f'{c} {m} {r} Length peaks')
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('Length')

    # Panel 2: every type that is neither 'length' nor contains 'pbp',
    # capped at 50 for display.
    t = (ts[(ts['type'] != 'length') & (~(ts['type'].str.contains('pbp')))]
         .sort_values(by=['name', 'type'])
         .assign(value=lambda d: d['value'].clip(upper=50)))
    ax = plt.subplot(1, 3, 2)
    sns.boxplot(data=t, x='name', hue='type', y='value', ax=ax)
    ax.title.set_text(f'{c} {m} {r} Coverage in peaks')
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('RPM')

    # Panel 3: types containing 'pbp', capped at 10.0 for display.
    t = (ts[ts['type'].str.contains('pbp')]
         .sort_values(by=['name', 'type'])
         .assign(value=lambda d: d['value'].clip(upper=10.0)))
    ax = plt.subplot(1, 3, 3)
    sns.boxplot(data=t, x='name', hue='type', y='value', ax=ax)
    ax.title.set_text(f'{c} {m} {r} Density in peaks')
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_ylabel('RPKM')

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/peaks_{m}_{c}_{r}.png', bbox_inches='tight', dpi=300)
    plt.show()

# Peaks length and density distributions analysis

In [None]:
print('Length distribution')

# Tool pairs to compare: every pair (in the order of `tools`) where one of the
# two tools is SPAN.
span_pairs = [(tools[i], tools[j])
              for i in range(len(tools))
              for j in range(i + 1, len(tools))
              if 'SPAN' in (tools[i], tools[j])]
# Three columns by default; widen the grid when there are more pairs so that
# the subplot index never runs out of range.
n_cols = max(3, len(span_pairs))

# For every modification draw, per tool pair, two length histograms
# (<= 10000 on the top row, > 10000 on a log scale on the bottom row) and save
# the figure to {PATH}/figures/hist_length_{m}.png.
for m in MODIFICATIONS:
    print(m)
    ts = df_no.loc[(df_no['modification'] == m) & (df_no['type'] == 'length')]
    plt.figure(figsize=(14, 6))
    for k, (tool1, tool2) in enumerate(tqdm(span_pairs), start=1):
        names = ts['name']
        # Select the peak sets that belong to this pair; the trailing ' ('
        # anchors the match right before the parenthesised suffix of the name.
        selected = (names.str.startswith(f'{tool1} (') |
                    names.str.startswith(f'{tool2} (') |
                    names.str.contains(f'coverage {tool1} (', regex=False) |
                    names.str.contains(f'coverage {tool2} (', regex=False) |
                    names.str.contains(f'diff {tool1} - {tool2} (', regex=False) |
                    names.str.contains(f'diff {tool2} - {tool1} (', regex=False))
        tst = ts[selected].copy()
        # Strip the parenthesised suffix. Raw string: '\(' is an invalid escape
        # sequence in a regular string literal.
        tst['name'] = tst['name'].str.replace(r' \(.*', '', regex=True)

        # Plot
        tst = (tst.sort_values(by=['name', 'type'])
               .rename(columns={'value': 'Length'}))
        ax = plt.subplot(2, n_cols, k)
        sns.histplot(data=tst.loc[tst['Length'] <= 10000], x='Length', hue='name',
                     stat='density', kde=True, ax=ax)
        ax.title.set_text(f'{m} {tool1} vs {tool2} l<=10k')
        ax = plt.subplot(2, n_cols, k + n_cols)
        sns.histplot(data=tst.loc[tst['Length'] > 10000], x='Length', hue='name',
                     log_scale=True, stat='density', kde=True, ax=ax)
        ax.title.set_text(f'{m} {tool1} vs {tool2} l>10k')

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/hist_length_{m}.png', bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
print('Density distribution')

# Tool pairs to compare: every pair (in the order of `tools`) where one of the
# two tools is SPAN.
span_pairs = [(tools[i], tools[j])
              for i in range(len(tools))
              for j in range(i + 1, len(tools))
              if 'SPAN' in (tools[i], tools[j])]
# Three columns by default; widen the grid when there are more pairs so that
# the subplot index never runs out of range.
n_cols = max(3, len(span_pairs))

# Rows whose type contains 'pbp'; computed once, it does not depend on `m`.
is_pbp = df_no['type'].str.contains('pbp', regex=False)

# For every modification draw one histogram of the 'pbp' values per tool pair
# and save the figure to {PATH}/figures/hist_density_{m}.png.
for m in MODIFICATIONS:
    print(m)
    ts = df_no.loc[(df_no['modification'] == m) & is_pbp]
    plt.figure(figsize=(14, 4))
    for k, (tool1, tool2) in enumerate(tqdm(span_pairs), start=1):
        names = ts['name']
        # Select the peak sets that belong to this pair; the trailing ' ('
        # anchors the match right before the parenthesised suffix of the name.
        selected = (names.str.startswith(f'{tool1} (') |
                    names.str.startswith(f'{tool2} (') |
                    names.str.contains(f'coverage {tool1} (', regex=False) |
                    names.str.contains(f'coverage {tool2} (', regex=False) |
                    names.str.contains(f'diff {tool1} - {tool2} (', regex=False) |
                    names.str.contains(f'diff {tool2} - {tool1} (', regex=False))
        tst = ts[selected].copy()
        # Strip the parenthesised suffix. Raw string: '\(' is an invalid escape
        # sequence in a regular string literal.
        tst['name'] = tst['name'].str.replace(r' \(.*', '', regex=True)

        # Plot
        # Values are capped at 10.0 for visualization. The cap is applied by
        # assignment: a chained `tst['value'].clip(inplace=True)` does not
        # modify `tst` under pandas Copy-on-Write.
        tst = (tst.sort_values(by=['name', 'type'])
               .assign(value=lambda d: d['value'].clip(upper=10.0))
               .rename(columns={'value': 'RPKM'}))
        ax = plt.subplot(1, n_cols, k)
        sns.histplot(data=tst, x='RPKM', hue='name', stat='density', kde=True, ax=ax)
        ax.title.set_text(f'{m} {tool1} vs {tool2}')

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/hist_density_{m}.png', bbox_inches='tight', dpi=300)
    plt.show()

### Peaks density vs length (2D KDE plot)

In [None]:
import random

# Upper bound on the number of rows sampled per peak set, and the seed that
# makes the sampling (and therefore the figure) reproducible.
MAX_SAMPLE = 100000
SAMPLE_SEED = 42

# Tool pairs to compare: every pair (in the order of `tools`) where one of the
# two tools is SPAN.
span_pairs = [(tools[i], tools[j])
              for i in range(len(tools))
              for j in range(i + 1, len(tools))
              if 'SPAN' in (tools[i], tools[j])]
# Three columns by default; widen the grid when there are more pairs so that
# the subplot index never runs out of range.
n_cols = max(3, len(span_pairs))

# For every modification draw one 2D KDE plot (length vs 'signal pbp' value)
# per tool pair and save the figure to {PATH}/figures/density_vs_len_{m}.png.
for m in MODIFICATIONS:
    print(m)
    ts = df_no.loc[
        (df_no['modification'] == m) & df_no['type'].isin(['signal pbp', 'length'])
    ].copy()
    # Strip the parenthesised suffix. Raw string: '\(' is an invalid escape
    # sequence in a regular string literal.
    ts['name'] = ts['name'].str.replace(r' \(.*', '', regex=True)
    plt.figure(figsize=(14, 4))
    for k, (tool1, tool2) in enumerate(tqdm(span_pairs), start=1):
        # Use stratified sampling for performance reasons.
        # The sample size is bounded by the group size: DataFrame.sample raises
        # when asked for more rows than are available.
        tss = []
        for name in [tool1, tool2, f'coverage {tool1}', f'coverage {tool2}']:
            group = ts[ts['name'] == name]
            tss.append(group.sample(min(len(group), MAX_SAMPLE), random_state=SAMPLE_SEED))

        # Plot
        tst = pd.concat(tss)
        # NOTE(review): the length and RPKM values are split out of the same
        # random sample by type, so nothing here guarantees that both arrays
        # have the same size or that position i refers to the same peak in
        # both. If df_peaks carries a peak identifier, pair the values on it
        # before plotting - TODO confirm.
        t_len = tst[tst['type'] == 'length']
        t_rpkm = tst[tst['type'] != 'length']
        ax = plt.subplot(1, n_cols, k)
        sns.kdeplot(x=t_len['value'].clip(upper=20000).values,
                    y=t_rpkm['value'].clip(upper=30.0).values,
                    hue=t_rpkm['name'].values,
                    fill=True, alpha=0.4, ax=ax)
        ax.title.set_text(f'{m} {tool1} vs {tool2}')
        ax.set_xlabel('Length')
        ax.set_ylabel('RPKM')

    plt.tight_layout()
    plt.savefig(f'{PATH}/figures/density_vs_len_{m}.png', bbox_inches='tight', dpi=300)
    # The original referenced `plt.show` without calling it.
    plt.show()