# 2021 - GSE26320 peaks coverage

1. Copy `*.bam` for treatments and inputs to `<FOLDER>/bam/`
2. Copy peaks into `<FOLDER>/peaks/`

    
```
# Root folder with the bam/ and peaks/ inputs; tags/, bed4/ and covs/ are created next to them.
FOLDER=/mnt/stripe/shpynov/2021_GSE26320_GM12878_H3K4me3
cd "$FOLDER"

cd bam
# BAM to tags: every read becomes a 1bp tag, SHIFT bp after the read start ($2) on '+'
# reads and SHIFT bp before the read end ($3) on '-' reads.
# All BAMs are converted (treatments and inputs): the coverage step below needs both.
SHIFT=125
for BAM in *.bam; do
   echo "${BAM}";
   bedtools bamtobed -i "${BAM}" |\
      awk -v OFS='\t' -v S=${SHIFT} \
      '{if ($6 != "-") {print($1, $2+S, $2+S+1)} else {if ($3-S>=1) {print($1, $3-S, $3-S+1)}}}' |\
      sort -u -k1,1 -k3,3n -k2,2n > "${BAM/.bam/.tags}"
done

# -p: do not fail when the pipeline is re-run and the folder already exists
mkdir -p ../tags
mv *.tags ../tags

cd ../peaks
# Keep chromosome, start, end and column 5 of every peak, sorted by position
for F in *.narrowPeak; do
    echo "$F";
    awk -v OFS='\t' '{print $1,$2,$3,$5}' "$F" | sort -k1,1 -k2,2n > "$F.bed4";
done;
mkdir -p ../bed4
mv *.bed4 ../bed4/

# Add coverage information to peaks
mkdir -p ../covs
cd ../bed4
# Cell names: file name without the GSM accession prefix, cut at the first underscore
CELLS=$(ls *.bed4 | sed -E 's/GSM[0-9]+_//g' | sed 's/_.*//g' | sort --unique);
# CELLS is intentionally left unquoted so that it is split into one word per cell
for C in ${CELLS}; do
   echo "$C";
   for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do
       for R in rep1 rep2; do
          echo "$R";
          for F in *${C}_${M}_${R}*.bed4; do
             # An unmatched glob is passed through literally; skip it instead of failing in bedtools
             [ -e "$F" ] || continue
             echo "$F";
             echo $(ls ../tags/*${C}_${M}_${R}*.tags);
             echo $(ls ../tags/*${C}_Input_${R}*.tags);
             # Appends column 5: number of treatment tags overlapping the peak
             bedtools intersect -a "$F" -b ../tags/*${C}_${M}_${R}*.tags -wa -c > "../covs/${F}t";
             # Appends column 6: number of control (Input) tags overlapping the peak
             bedtools intersect -a "../covs/${F}t" -b ../tags/*${C}_Input_${R}*.tags -wa -c > "../covs/${F}tc";
             rm "../covs/${F}t";
          done
       done
   done
done
```

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import subprocess, tempfile
from itertools import product
import plotly.graph_objects as go
import plotly.express as px

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a BED-like file.

    The file is read as tab-separated text without a header; column 1 is taken
    as the start and column 2 as the end (0-based column numbers).

    Returns a pandas Series with one length per row, or an empty list when the
    file has no content. Any other problem (missing file, malformed rows) is
    raised instead of being reported as "no intervals".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return []  # Empty file
    return tf[2] - tf[1]

def lines(file):
    """Return the number of rows in a tab-separated, header-less file.

    An empty file counts as 0 rows. Any other read problem (e.g. a missing
    file) is raised rather than silently reported as 0.
    """
    try:
        return len(pd.read_csv(file, sep='\t', header=None))
    except pd.errors.EmptyDataError:
        return 0  # Empty file

def join(file):
    """Format the intervals of a BED-like file as one `chr:start-end,...` string.

    The file is read as tab-separated text without a header; the first three
    columns are used as chromosome, start and end.

    Returns an empty string for an empty file. Other read problems are raised.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return ''  # Empty file
    return ','.join(f'{c}:{s}-{e}' for c, s, e in zip(tf[0], tf[1], tf[2]))

def rank_correlation(file):
    """Return the Spearman rank correlation between columns 3 and 7 of a file.

    The file is read as tab-separated text without a header (0-based column
    numbers). An empty file yields 0; other problems are raised.
    """
    # Imported here because the notebook's import cell only has `import scipy as sp`,
    # which does not bring `spearmanr` into scope.
    from scipy.stats import spearmanr
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return 0  # Empty file
    return spearmanr(tf[3], tf[7])[0]
    
def d(a, b):
    """Safe division: `a / b`, or 0 when the denominator `b` equals 0."""
    if b != 0:
        return a / b
    return 0


In [None]:
PATH = '/mnt/stripe/shpynov/2021_GSE26320'
# os.makedirs(exist_ok=True) keeps the cell re-runnable: a plain `mkdir` reports an
# error as soon as the folder already exists
os.makedirs(os.path.join(PATH, 'figures'), exist_ok=True)

# Don't have H1 H3K27ac rep2
# Don't have Huvec H3K4me3 rep1 
# Don't have HepG2 H3K4me1 rep2
CELLS = ['GM12878', 'HMEC', 'HSMM', 'K562', 'NHEK', 'NHLF']  # Ignore H1, Huvec, HepG2

MODIFICATIONS = ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']

REPS = ['rep1', 'rep2']

# Sorted so that the row order of `df` does not depend on the order in which the
# filesystem happens to list the folder
FILES = sorted(os.listdir(os.path.join(PATH, 'covs')))

# File name marker -> peak caller. Checked in this order, the first match wins
# (so 'rep1_rep2' takes precedence over the plain 'ENCODE' marker).
TOOL_MARKERS = [
    ('rep1_rep2', 'ENCODE rep'),
    ('ENCODE', 'ENCODE'),
    ('5.peak', 'SPAN'),
    ('FDR', 'SICER'),
    ('narrowPeak', 'MACS2'),
    ('broadPeak', 'MACS2 broad'),
]

reps = []
tools = []
cells = []
mods = []
files = []

for f in FILES:
    if 'rep1_rep2' in f:
        rep = 'rep1_rep2'  # ENCODE replicated peaks support
    elif 'rep1' in f:
        rep = 'rep1'
    else:
        rep = 'rep2'  # Every file without a rep1 marker is treated as replicate 2

    tool = next((label for marker, label in TOOL_MARKERS if marker in f), None)
    cell = next((cc for cc in CELLS if cc in f), None)
    mod = next((mm for mm in MODIFICATIONS if mm in f), None)

    # Files that cannot be attributed to a known tool, cell and modification are skipped
    if rep and tool and cell and mod:
        reps.append(rep)
        tools.append(tool)
        cells.append(cell)
        mods.append(mod)
        files.append(f'{PATH}/covs/{f}')
        
df = pd.DataFrame(dict(cell=cells, modification=mods, rep=reps, file=files, tool=tools))
df

In [None]:
# Library sizes (number of tags):
#   coveraget[(modification, cell, replicate)] - treatment
#   coveragec[(cell, replicate)]               - control (Input)
coveraget = {}
coveragec = {}

# The tags folder is listed once; sorting makes the choice deterministic when
# several file names contain the same marker
tags_dir = f'{PATH}/tags/'
tag_files = sorted(os.listdir(tags_dir))


def _count_tags(marker):
    """Return the number of rows of the first tags file whose name contains `marker`.

    Raises FileNotFoundError with the offending marker when no file matches.
    """
    name = next((f for f in tag_files if marker in f), None)
    if name is None:
        raise FileNotFoundError(f'No tags file matching "{marker}" in {tags_dir}')
    return lines(tags_dir + name)


for m, c, r in product(MODIFICATIONS, CELLS, REPS):
    print(m, c, r)
    coveraget[(m, c, r)] = _count_tags(f'{c}_{m}_{r}')
    # The control does not depend on the modification: count each Input file only once
    if (c, r) not in coveragec:
        coveragec[(c, r)] = _count_tags(f'{c}_Input_{r}')

display(coveraget)
display(coveragec)

In [None]:
# Per-file summary: number of peaks, total length and mean peak length
peaks, lengths, avlengths = [], [], []

for path in df['file']:
    count = lines(path)
    total_length = sum(bedl(path))
    peaks.append(count)
    lengths.append(total_length)
    # Files without peaks get an average length of 0 instead of a division by zero
    avlengths.append(total_length / count if count != 0 else 0)

df['peaks'] = peaks
df['length'] = lengths
df['average length'] = avlengths
df

### Compute detailed peaks information

In [None]:
def coverage_t(file):
    """Return column 4 (0-based) of a tab-separated, header-less peaks file.

    The notebook uses this column as the per-peak treatment ("signal") tag count.

    Returns a pandas Series, or an empty list for an empty file. Other problems
    (missing file, missing column) are raised instead of being hidden.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return []  # Empty file
    return tf[4]

def coverage_c(file):
    """Return column 5 (0-based) of a tab-separated, header-less peaks file.

    The notebook uses this column as the per-peak control tag count.

    Returns a pandas Series, or an empty list for an empty file. Other problems
    (missing file, missing column) are raised instead of being hidden.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return []  # Empty file
    return tf[5]

def coverage_a(file, cell, replicate, modification=None):
    """Return the control-subtracted coverage of every peak in `file`.

    For each row: column 4 minus column 5 scaled by min(1, treatment library
    size / control library size), clipped at 0 from below.

    Library sizes come from the notebook-level dicts `coveraget`, keyed by
    (modification, cell, replicate), and `coveragec`, keyed by (cell, replicate).
    Pass `modification` so that the treatment size can be found; without it the
    lookup falls back to the (cell, replicate) key.

    Returns a pandas Series, or an empty list when the file is empty or a
    library size is unknown or zero.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        return []  # Empty file
    key = (cell, replicate) if modification is None else (modification, cell, replicate)
    treatment = coveraget.get(key)
    control = coveragec.get((cell, replicate))
    if treatment is None or not control:
        return []  # No library sizes to scale the control by
    return (tf[4] - tf[5] * min(1, treatment / control)).clip(lower=0)

def zip6(v1, v2, v3, v4, v5, arr):
    """Prefix every element of `arr` with five constant values: one 6-tuple per element."""
    return [(v1, v2, v3, v4, v5, a) for a in arr]


def _bedtools_intersect(file_a, file_b, out, *flags):
    """Run `bedtools intersect -a file_a -b file_b <flags>` and write its stdout to `out`.

    Arguments are passed as a list (no shell), and a failing bedtools run raises
    CalledProcessError instead of leaving an empty result file behind.
    """
    with open(out, 'w') as sink:
        subprocess.run(['bedtools', 'intersect', '-a', file_a, '-b', file_b, *flags],
                       stdout=sink, check=True)


def _peaks_summary(file, m, c, r, length_label, coverage_label, covtm, covcm):
    """Describe the peaks of one file.

    Returns a dict with:
      'peaks'   - number of peaks,
      'length'  - total peak length,
      't', 'c', 'a' - total treatment, control and control-subtracted coverage,
      'records' - per-peak (m, c, r, name, type, value) tuples for `df_peaks`:
                  lengths under '<length_label> (<peaks>)' and the signal / control
                  coverage, plain and per 1000 bp, under '<coverage_label> (<peaks>)'.

    `covtm` / `covcm` are the treatment / control library sizes the coverage is divided by.
    """
    peaks, length = lines(file), bedl(file)
    covt, covc, cova = coverage_t(file), coverage_c(file), coverage_a(file, c, r, m)
    length_name = f'{length_label} ({peaks})'
    coverage_name = f'{coverage_label} ({peaks})'

    records = zip6(m, c, r, length_name, 'length', length)
    records += zip6(m, c, r, coverage_name, 'signal', [co / covtm for co in covt])
    records += zip6(m, c, r, coverage_name, 'control', [co / covcm for co in covc])
    records += zip6(m, c, r, coverage_name, 'signal pbp',
                    [d(co / covtm, l * 1e-3) for co, l in zip(covt, length)])
    records += zip6(m, c, r, coverage_name, 'control pbp',
                    [d(co / covcm, l * 1e-3) for co, l in zip(covc, length)])
    # Per-peak control-subtracted ('aggregated') records are not emitted, only the total
    return dict(peaks=peaks, length=sum(length),
                t=sum(covt), c=sum(covc), a=sum(cova), records=records)


def _pair_summary(kind, m, c, r, tool_a, file_a, tool_b, file_b, covtm, covcm):
    """Summarise the peaks of `tool_a` relative to `tool_b`.

    kind='overlap': peaks of tool_a overlapping any peak of tool_b (`-wa -u`);
    kind='diff':    peaks of tool_a overlapping no peak of tool_b (`-v`).
    The intermediate BED file is written into PATH and reused by later calls.
    """
    tf = f'{PATH}/{r}_{kind}_{tool_a}_vs_{tool_b}.bed'.replace(' ', '_')
    if kind == 'overlap':
        _bedtools_intersect(file_a, file_b, tf, '-wa', '-u')
        label = f'overlap {tool_a} vs {tool_b}'
    else:
        _bedtools_intersect(file_a, file_b, tf, '-v')
        label = f'diff {tool_a} - {tool_b}'
    return _peaks_summary(tf, m, c, r, label, label, covtm, covcm)


def _tool_file(m, c, r, tool):
    """Return the first peaks file of `tool` for the given modification, cell and replicate.

    The replicate is matched as a substring, so 'rep1_rep2' files serve both replicates.
    """
    selected = df.loc[(df['cell'] == c) & (df['modification'] == m) &
                      (df['rep'].str.contains(r)) & (df['tool'] == tool)]
    return selected['file'].values[0]


comparison = pd.DataFrame(columns=[
    'modification', 'cell', 'replicate', 'tool1', 'tool2', 
    'peaks1', 'peaks2', 'length1', 'length2',
    'overlap1', 'overlap2', 'overlap1l', 'overlap2l', 
    'diff1', 'diff2', 'diff1l', 'diff2l',
    'cov1t', 'cov1c', 'cov1a', 
    'cov2t', 'cov2c', 'cov2a',
    'overlap1t', 'overlap1c', 'overlap1a',
    'overlap2t', 'overlap2c', 'overlap2a',                                      
    'diff1t', 'diff1c', 'diff1a',
    'diff2t', 'diff2c', 'diff2a'], dtype=object)

tools = list(sorted(set(df['tool'])))

peak_records = []
for m, c, r in product(MODIFICATIONS, CELLS, REPS):
    print(m, c, r)

    # Library sizes in millions of tags
    covtm = coveraget[(m, c, r)] * 1e-6
    covcm = coveragec[(c, r)] * 1e-6

    for i in tqdm(range(len(tools))):
        tool1 = tools[i]
        file1 = _tool_file(m, c, r, tool1)
        first = _peaks_summary(file1, m, c, r, tool1, f'coverage {tool1}', covtm, covcm)
        peak_records.extend(first['records'])

        for j in range(i + 1, len(tools)):
            tool2 = tools[j]
            file2 = _tool_file(m, c, r, tool2)
            # The second tool is described from its own columns, control included
            second = _peaks_summary(file2, m, c, r, tool2, f'coverage {tool2}', covtm, covcm)
            peak_records.extend(second['records'])

            both1 = _pair_summary('overlap', m, c, r, tool1, file1, tool2, file2, covtm, covcm)
            peak_records.extend(both1['records'])
            both2 = _pair_summary('overlap', m, c, r, tool2, file2, tool1, file1, covtm, covcm)
            peak_records.extend(both2['records'])
            only1 = _pair_summary('diff', m, c, r, tool1, file1, tool2, file2, covtm, covcm)
            peak_records.extend(only1['records'])
            only2 = _pair_summary('diff', m, c, r, tool2, file2, tool1, file1, covtm, covcm)
            peak_records.extend(only2['records'])

            # Same order as the `comparison` columns declared above
            comparison.loc[len(comparison)] = (
                m, c, r, tool1, tool2,
                first['peaks'], second['peaks'], first['length'], second['length'],
                both1['peaks'], both2['peaks'], both1['length'], both2['length'],
                only1['peaks'], only2['peaks'], only1['length'], only2['length'],
                first['t'], first['c'], first['a'],
                second['t'], second['c'], second['a'],
                both1['t'], both1['c'], both1['a'],
                both2['t'], both2['c'], both2['a'],
                only1['t'], only1['c'], only1['a'],
                only2['t'], only2['c'], only2['a'],
            )
            
            
df_peaks = pd.DataFrame(peak_records, columns=['modification', 'cell', 'replicate', 'name', 'type', 'value'])
df_peaks = df_peaks.sort_values(by=['name'])

display(df_peaks.head())
display(comparison.head())
comparison.to_csv(f'{PATH}/comparison.csv', index=False)

In [None]:
print('Agggregate peaks, length, coverage and density information')
comparison2 = pd.DataFrame(columns=['m', 'c', 'r', 'coverage', 'tool1', 'tool2', 
      'peaks1', 'peaks2', 'overlap1', 'overlap2', 'diff1', 'diff2',
      'avl1', 'avl2', 'ov_avl1', 'ov_avl2', 'diff_avl1', 'diff_avl2', 
      'avcov1', 'avcov2', 'ov_avcov1', 'ov_avcov2', 'diff_avcov1', 'diff_avcov2', 
      'covpbp1', 'covpbp2', 'ov_covpbp1', 'ov_covpbp2', 'diff_covpbp1', 'diff_covpbp2'], dtype=object)

# The six peak sets of a pairwise comparison, in output order:
# (peaks number column, total length column, coverage column prefix) of `comparison`
peak_sets = [
    ('peaks1', 'length1', 'cov1'), ('peaks2', 'length2', 'cov2'),
    ('overlap1', 'overlap1l', 'overlap1'), ('overlap2', 'overlap2l', 'overlap2'),
    ('diff1', 'diff1l', 'diff1'), ('diff2', 'diff2l', 'diff2'),
]

for idx, row in tqdm(comparison.iterrows()):
    numbers = [row[number] for number, _length, _prefix in peak_sets]
    totals = [row[length] for _number, length, _prefix in peak_sets]
    average_lengths = [d(total, number) for total, number in zip(totals, numbers)]

    # One output row per coverage kind: 't' columns hold signal, 'c' columns hold control.
    # The control-subtracted 'a' columns are intentionally not reported.
    for kind, suffix in (('signal', 't'), ('control', 'c')):
        coverages = [row[prefix + suffix] for _number, _length, prefix in peak_sets]
        comparison2.loc[len(comparison2)] = (
            row['modification'], row['cell'], row['replicate'], kind, row['tool1'], row['tool2'],
            *numbers,
            *average_lengths,
            # Average coverage per peak
            *[d(coverage, number) for coverage, number in zip(coverages, numbers)],
            # Coverage per base pair
            *[d(coverage, total) for coverage, total in zip(coverages, totals)],
        )

In [None]:
# Show every column of the wide table instead of an elided view, then persist it
pd.options.display.max_columns = None
display(comparison2.head())
comparison2.to_csv(f'{PATH}/comparison2.csv', index=False)

### Pairwise comparisons

In [None]:
# Pairwise tools info
def pairwise_info(name, fname, columns):
    """Draw box plots of `columns` from `comparison2` for every pair of tools.

    For each tool pair one figure is produced with one panel per modification.
    Only the 'signal' rows of `comparison2` are used, and each box collects the
    values of one column over all cells and replicates.

    name    -- y axis label, also printed as the figure caption
    fname   -- prefix of the PNG written to `{PATH}/figures/`
    columns -- list of `comparison2` columns; they are relabelled in order as
               tool1, tool2, overlap 1 vs 2, overlap 2 vs 1, diff 1 - 2, diff 2 - 1
    """
    for i in tqdm(range(len(tools))):
        for j in range(i + 1, len(tools)):
            tool1 = tools[i]
            tool2 = tools[j]
            print(f'{tool1} vs {tool2}')
            labels = [tool1, tool2,
                      f'overlap {tool1} vs {tool2}', f'overlap {tool2} vs {tool1}',
                      f'diff {tool1} - {tool2}', f'diff {tool2} - {tool1}']
            plt.figure(figsize=(4 * len(MODIFICATIONS), 6))

            for k, m in enumerate(MODIFICATIONS):
                ts = comparison2.loc[(comparison2['m'] == m) & (comparison2['tool1'] == tool1) & 
                                     (comparison2['tool2'] == tool2) & 
                                     (comparison2['coverage'] == 'signal')].copy()
                ts['exp'] = [f'{c} {r}' for c, r in zip(ts['c'], ts['r'])]
                t = ts[['exp'] + columns].rename(columns=dict(zip(columns, labels)))

                # Everything but the experiment id is a measurement. The list is built
                # explicitly: `list.remove()` returns None, not the remaining items.
                value_vars = [col for col in t.columns if col != 'exp']
                t = pd.melt(t, id_vars=['exp'], value_vars=value_vars)
                
                # Plot 
                ax = plt.subplot(1, len(MODIFICATIONS), k + 1)
                ax.title.set_text(m)
                sns.boxplot(data=t, x='variable', y='value', ax=ax)
                ax.xaxis.set_tick_params(rotation=90)
                ax.set_ylabel(name)


            print(f'{name} {tool1} vs {tool2}')
            plt.tight_layout()
            plt.savefig(f'{PATH}/figures/{fname}_{tool1}_{tool2}.png', bbox_inches='tight', dpi=300)        
            plt.show()

In [None]:
# Number of peaks: each tool, the overlapping peaks and the tool-specific peaks
pairwise_info(name='Peaks number', fname='peaks',
              columns=['peaks1', 'peaks2', 'overlap1', 'overlap2', 'diff1', 'diff2'])

In [None]:
# Average peak length: each tool, the overlapping peaks and the tool-specific peaks
pairwise_info(name='Peaks length', fname='length',
              columns=['avl1', 'avl2', 'ov_avl1', 'ov_avl2', 'diff_avl1', 'diff_avl2'])

In [None]:
# Average coverage per peak: each tool, the overlapping peaks and the tool-specific peaks
pairwise_info(name='Coverage', fname='coverage',
              columns=['avcov1', 'avcov2', 'ov_avcov1', 'ov_avcov2', 'diff_avcov1', 'diff_avcov2'])

In [None]:
# Coverage per base pair: each tool, the overlapping peaks and the tool-specific peaks
pairwise_info(name='Density', fname='density',
              columns=['covpbp1', 'covpbp2', 'ov_covpbp1', 'ov_covpbp2', 'diff_covpbp1', 'diff_covpbp2'])

### Pairwise peaks number venn diagrams

In [None]:
from matplotlib_venn import venn2, venn2_circles

# One Venn diagram of peak numbers per row of `comparison` (a pair of tools for one
# modification, cell and replicate)
for _, row in tqdm(comparison.iterrows()):
    # m is the modification and c the cell, as everywhere else in the notebook, so that
    # titles read "cell modification replicate" and files are named {m}_{c}_{r}
    m, c, r = row['modification'], row['cell'], row['replicate']
    tool1, tool2 = row['tool1'], row['tool2']
    peaks1, peaks2 = row['peaks1'], row['peaks2']
    overlap1, overlap2 = row['overlap1'], row['overlap2']
    print(c, m , r, f'{tool1} vs {tool2}')
    if peaks1 == 0 and peaks2 == 0:
        print('No peaks')
        continue
        
    # Tweak subsets to avoid empty areas: the drawn areas get a minimal size,
    # the true numbers are written into the labels below
    subsets = (max(0.15 * max(peaks1, peaks2), peaks1 - overlap1),
              max(0.15 * max(peaks1, peaks2), peaks2 - overlap2),
              max(0.3 * max(peaks1, peaks2), overlap1, overlap2))
    v = venn2(subsets=subsets,set_labels=(tool1, tool2), set_colors=('r', 'g'), alpha = 0.5)
    venn2_circles(subsets=subsets, color='grey')
    v.get_label_by_id('10').set_text(str(peaks1 - overlap1))
    v.get_label_by_id('01').set_text(str(peaks2 - overlap2))
    # The overlap is counted from both sides; show both numbers and their ratio
    if overlap1 != 0 and overlap2 != 0:
        prop = f'{overlap1/overlap2:.1f} : 1' if overlap1 > overlap2 else f'1 : {overlap2/overlap1:.1f}'
        prop = prop.replace('.0', '')  # Cosmetics
    else:
        prop = ''
    v.get_label_by_id('11').set_text(f'{overlap1} : {overlap2}\n{prop}')
    
    plt.title(f'{c} {m} {r} Venn')
    plt.tight_layout()    
    plt.savefig(f'{PATH}/figures/venn_{m}_{c}_{r}_{tool1}_{tool2}.png', bbox_inches='tight', dpi=300)        
    plt.show()

### Pairwise tools detailed comparison per cell and replicates

In [None]:
# Narrows the detailed plots in the following cells to a single modification.
# NOTE(review): this rebinds the notebook-level MODIFICATIONS list, so cells above that
# iterate over it behave differently when re-run after this cell - re-run top to bottom.
MODIFICATIONS = ['H3K4me3']

In [None]:
def _pair_boxplot(data, upper, ylabel, title, png, figsize, hue=None):
    """Box plot of `value` per `name` (optionally split by `hue`), saved to `png` and shown.

    Values above `upper` are capped so that outliers do not flatten the boxes.
    """
    data = data.copy()
    # Assign the clipped column back: a chained `data['value'].clip(..., inplace=True)`
    # does not modify the frame under pandas copy-on-write
    data['value'] = data['value'].clip(upper=upper)
    plt.figure(figsize=figsize)
    sns.boxplot(data=data, x='name', y='value', hue=hue)
    plt.xticks(rotation=90)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(png, bbox_inches='tight', dpi=300)
    plt.show()


for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    ts = df_peaks.loc[(df_peaks['cell'] == c) & (df_peaks['modification'] == m) & (df_peaks['replicate'] == r)]
    ts = ts[[not nm.startswith('overlap') for nm in ts['name']]].copy()
    for i in range(len(tools)):
        for j in range(i + 1, len(tools)):
            tool1 = tools[i]
            tool2 = tools[j]
            print(m, c, r, f'{tool1} vs {tool2}')
            # Rows of the two tools and of their differences; the trailing ' (' keeps
            # e.g. 'MACS2' from also matching 'MACS2 broad'
            tst = ts[[n.startswith(f'{tool1} (') or n.startswith(f'{tool2} (') or 
                      f'coverage {tool1} (' in n or f'coverage {tool2} (' in n or 
                      f'diff {tool1} - {tool2} (' in n or f'diff {tool2} - {tool1} (' in n 
                      for n in ts['name']]]
            if len(tst) == 0:
                continue

            is_length = np.array([ty == 'length' for ty in tst['type']], dtype=bool)
            is_pbp = np.array(['pbp' in ty for ty in tst['type']], dtype=bool)
            suffix = f'{m}_{c}_{r}_{tool1}_{tool2}.png'

            # Peak lengths
            _pair_boxplot(tst[is_length][['name', 'value']], 10000, 'Length',
                          f'{c} {m} {r} Length peaks',
                          f'{PATH}/figures/length_{suffix}', figsize=(8, 6))

            # Signal and control coverage per peak
            _pair_boxplot(tst[~is_length & ~is_pbp].sort_values(by=['name', 'type']), 50, 'RPM',
                          f'{c} {m} {r} Coverage in peaks',
                          f'{PATH}/figures/coverage_{suffix}', figsize=(10, 6), hue='type')

            # Signal and control coverage per base pair
            _pair_boxplot(tst[~is_length & is_pbp].sort_values(by=['name', 'type']), 10.0, 'RPKM',
                          f'{c} {m} {r} Coverage per basepair in peaks',
                          f'{PATH}/figures/density_{suffix}', figsize=(10, 6), hue='type')

### Aggregated info for all tools per modification, cells and replicates

In [None]:
def _summary_boxplot(data, upper, ylabel, title, png, hue=None):
    """Box plot of `value` per `name` (optionally split by `hue`), saved to `png` and shown.

    Values above `upper` are capped so that outliers do not flatten the boxes.
    """
    data = data.copy()
    # Assign the clipped column back: a chained `data['value'].clip(..., inplace=True)`
    # does not modify the frame under pandas copy-on-write
    data['value'] = data['value'].clip(upper=upper)
    plt.figure(figsize=(15, 7))
    sns.boxplot(data=data, x='name', y='value', hue=hue)
    plt.xticks(rotation=90)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(png, bbox_inches='tight', dpi=300)
    plt.show()


for m, c, r in tqdm(product(MODIFICATIONS, CELLS, REPS)):
    print(m, c, r)
    ts = df_peaks.loc[(df_peaks['cell'] == c) & (df_peaks['modification'] == m) & (df_peaks['replicate'] == r)]
    ts = ts[[not nm.startswith('overlap') for nm in ts['name']]].copy()

    if len(ts) == 0:
        continue

    is_length = np.array([ty == 'length' for ty in ts['type']], dtype=bool)
    is_pbp = np.array(['pbp' in ty for ty in ts['type']], dtype=bool)

    # Peak lengths
    _summary_boxplot(ts[is_length][['name', 'value']], 10000, 'Length',
                     f'{c} {m} {r} Length peaks',
                     f'{PATH}/figures/length_{m}_{c}_{r}.png')

    # Signal and control coverage per peak
    _summary_boxplot(ts[~is_length & ~is_pbp].sort_values(by=['name', 'type']), 50, 'RPM',
                     f'{c} {m} {r} Coverage in peaks',
                     f'{PATH}/figures/coverage_{m}_{c}_{r}.png', hue='type')

    # Signal and control coverage per base pair
    _summary_boxplot(ts[~is_length & is_pbp].sort_values(by=['name', 'type']), 10.0, 'RPKM',
                     f'{c} {m} {r} Coverage per basepair in peaks',
                     f'{PATH}/figures/density_{m}_{c}_{r}.png', hue='type')


### Pairwise tools comparison per modification

In [None]:
import re


def _experiment_panels(data, m, upper, ylabel, png, hue=None):
    """One box plot panel per `name`, experiments on the x axis; saved to `png` and shown.

    Values are capped at `upper`, which is also the upper limit of every y axis.
    """
    data = data.copy()
    # Assign the clipped column back: a chained `data['value'].clip(..., inplace=True)`
    # does not modify the frame under pandas copy-on-write
    data['value'] = data['value'].clip(upper=upper)
    names = sorted(set(data['name']))
    plt.figure(figsize=(4 * len(names), 6))
    for k, name in enumerate(names):
        ax = plt.subplot(1, len(names), k + 1)
        ax.title.set_text(f'{m} {name}')
        sns.boxplot(data=data[data['name'] == name], x='exp', y='value', hue=hue, ax=ax)
        ax.set_ylim([0, upper])
        ax.xaxis.set_tick_params(rotation=45)
        ax.set_ylabel(ylabel)

    plt.tight_layout()
    plt.savefig(png, bbox_inches='tight', dpi=300)
    plt.show()


# Pairwise tools
for m in tqdm(MODIFICATIONS):
    print(m)
    ts = df_peaks.loc[df_peaks['modification'] == m]
    for i in range(len(tools)):
        for j in range(i + 1, len(tools)):
            tool1 = tools[i]
            tool2 = tools[j]
            print(f'{tool1} vs {tool2}')
            tst = ts[[n.startswith(f'{tool1} (') or n.startswith(f'{tool2} (') or 
                      f'coverage {tool1} (' in n or f'coverage {tool2} (' in n or 
                      f'diff {tool1} - {tool2} (' in n or f'diff {tool2} - {tool1} (' in n 
                      for n in ts['name']]].copy()
            # Nothing to draw for this pair: an empty figure cannot be laid out
            if len(tst) == 0:
                continue
            tst['exp'] = [f'{c}_{r}' for c, r in zip(tst['cell'], tst['replicate'])]
            # Drop the ' (<peaks number>)' suffix so that all experiments share one name
            tst['name'] = [re.sub(r' \(.*', '', n) for n in tst['name']]
            tst = tst.sort_values(by=['name', 'type', 'exp'])
            print(','.join(sorted(set(tst['name']))))
            
            # Plots
            is_length = np.array([ty == 'length' for ty in tst['type']], dtype=bool)
            is_pbp = np.array(['pbp' in ty for ty in tst['type']], dtype=bool)

            # Peak lengths
            _experiment_panels(tst[is_length], m, 10000, 'Length',
                               f'{PATH}/figures/length_{m}_{tool1}_{tool2}.png')

            # Signal and control coverage per peak
            _experiment_panels(tst[~is_length & ~is_pbp].sort_values(by=['name', 'type']),
                               m, 50, 'RPM',
                               f'{PATH}/figures/coverage_{m}_{tool1}_{tool2}.png', hue='type')

            # Signal and control coverage per base pair
            _experiment_panels(tst[~is_length & is_pbp].sort_values(by=['name', 'type']),
                               m, 20, 'RPKM',
                               f'{PATH}/figures/density_{m}_{tool1}_{tool2}.png', hue='type')