# 2021 Comparative analysis benchmark

Logbook: https://docs.google.com/document/d/1NsqY_mA7U-jY2aLQlaQimiNLaRvYVx26cAia8HTdG_M/edit#


In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os, re
import glob
from tqdm.auto import tqdm
import tempfile
from itertools import product


In [None]:
# Histone modifications included in the benchmark.
# H3K27ac is ignored because of a single replicate and bad quality.
# NOTE(review): the original comment listed H3K27ac twice; the second entry was
# probably meant to name a different mark -- confirm against the logbook.
MODIFICATIONS = [
    'H3K4ac', 'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K9ac', 'H3K9me3',
    'H3K27me3', 'H3K36me3', 'H3K56ac', 'H3K79me1', 'H3K79me2'
]
# MODIFICATIONS = ['H3K4me1', 'H3K4me3', 'H3K36me3']

# Root data folder; the per-tool folders and the figures folder live below it.
PATH = os.path.expanduser('~/data/2022_comparative_analysis')
FIGURES_DIR = os.path.join(PATH, 'figures')
# File extension (and therefore image format) used for every saved figure.
EXT = 'png'
# Create the output folder from Python instead of `! mkdir -p {FIGURES_DIR}`:
# no shell is involved, so a path containing spaces cannot be split into
# several arguments, and an already existing folder is not an error.
os.makedirs(FIGURES_DIR, exist_ok=True)

In [None]:
def bedl(file):
    """Return the length (end - start) of every interval in a BED-like file.

    The file is read as headerless tab-separated text; column 1 is used as the
    interval start and column 2 as the interval end.

    Args:
        file: path of the file to read.

    Returns:
        A pandas Series with one length per row, or an empty numpy array when
        the file contains no data.

    Raises:
        Any other read error (missing file, malformed content) is propagated
        instead of being silently reported as an empty file.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # pandas raises this when there is nothing to parse: no intervals.
        return np.zeros(0)
    return tf[2] - tf[1]

def lines(file):
    """Return the number of rows in a headerless tab-separated file.

    Args:
        file: path of the file to read.

    Returns:
        The row count as parsed by pandas, or 0 when the file contains no data.

    Raises:
        Any other read error (missing file, malformed content) is propagated
        instead of being silently reported as zero rows.
    """
    try:
        return len(pd.read_csv(file, sep='\t', header=None))
    except pd.errors.EmptyDataError:
        # pandas raises this when there is nothing to parse: zero rows.
        return 0

# MACS2

In [None]:
MACS2_FOLDER = f'{PATH}/macs2'
MACS2_LEVELS = ['q0.05', 'broad0.1']

# One row per MACS2 peak file: the modification / replicate / level it belongs
# to, the number of peaks it holds and their total length.
dfm = pd.DataFrame(
    columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'],
    dtype=object,
)
for file in tqdm(glob.glob(MACS2_FOLDER + '/*.*Peak')):
    # Files with 'gapped' anywhere in their path are left out.
    if 'gapped' in file:
        continue
    # First configured level / modification whose '_<tag>' occurs in the path.
    level = next(filter(lambda lvl: f'_{lvl}' in file, MACS2_LEVELS), None)
    modification = next(filter(lambda mod: f'_{mod}' in file, MODIFICATIONS), None)
    if not (level and modification):
        continue
    # Replicate id: first three characters of the file name once the
    # 'H1_<modification>_' part has been removed.
    replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
    peaks = lines(file)
    length = bedl(file).sum()
    dfm.loc[len(dfm)] = (modification, replicate, f'macs2 {level}', file, peaks, length)

# The frame was created with object columns; make the numeric ones integers.
for column in ('peaks', 'length'):
    dfm[column] = dfm[column].astype(int)


In [None]:
print('Dataset size')
# Files per modification divided by the number of MACS2 levels.
# NOTE(review): this equals the replicate count only if every replicate has
# exactly one file per level -- confirm.
files_per_modification = dfm.groupby(['modification'])['file'].count()
t = (files_per_modification / len(MACS2_LEVELS)).astype(int)
replicates_table = pd.DataFrame(dict(modification=t.index, replicates=t))

plt.figure(figsize=(8, 4))
sns.barplot(data=replicates_table, x='modification', y='replicates')
plt.xticks(rotation=90)
plt.tight_layout()
replicates_figure = os.path.join(FIGURES_DIR, f'replicates_macs2.{EXT}')
plt.savefig(replicates_figure, bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Mean and standard deviation of the MACS2 peak counts per (modification, level).
macs2_peaks_by_level = dfm.groupby(['modification', 'level'])['peaks']
print('Mean peaks')
display(macs2_peaks_by_level.mean())
print('Std peaks')
display(macs2_peaks_by_level.std())

In [None]:
# Average peak length per file; a file without peaks gets 0 rather than a
# division by zero.
dfm['av_length'] = [
    total_length / peak_count if peak_count != 0 else 0
    for peak_count, total_length in zip(dfm['peaks'], dfm['length'])
]
macs2_av_length_by_level = dfm.groupby(['modification', 'level'])['av_length']
print('Mean average length')
display(macs2_av_length_by_level.mean())
print('Std average length')
display(macs2_av_length_by_level.std())

In [None]:
# MACS2 level labels (as stored in the 'level' column) kept for the summary.
macs2levels2process = {
    'macs2 q0.05',
    'macs2 broad0.1',
}

# SICER

In [None]:
SICER_FOLDER = f'{PATH}/sicer'
SICER_LEVELS = ['FDR0.01']

# One row per SICER islands-summary file: the modification / replicate / level
# it belongs to, the number of islands it holds and their total length.
dfsc = pd.DataFrame(
    columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'],
    dtype=object,
)
for file in tqdm(glob.glob(SICER_FOLDER + '/*islands-summary*')):
    # First configured level ('-<level>') / modification ('_<mod>') found in the path.
    level = next(filter(lambda lvl: f'-{lvl}' in file, SICER_LEVELS), None)
    modification = next(filter(lambda mod: f'_{mod}' in file, MODIFICATIONS), None)
    if not (level and modification):
        continue
    # Replicate id: first three characters of the file name once the
    # 'H1_<modification>_' part has been removed.
    replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
    peaks = lines(file)
    length = bedl(file).sum()
    dfsc.loc[len(dfsc)] = (modification, replicate, f'sicer {level}', file, peaks, length)

# The frame was created with object columns; make the numeric ones integers.
for column in ('peaks', 'length'):
    dfsc[column] = dfsc[column].astype(int)

In [None]:
# Number of SICER files found per (modification, level).
sicer_file_counts = dfsc.groupby(['modification', 'level'])['file'].count()
display(sicer_file_counts.astype(int))

In [None]:
# Mean and standard deviation of the SICER peak counts per (modification, level).
sicer_peaks_by_level = dfsc.groupby(['modification', 'level'])['peaks']
print('Mean peaks')
display(sicer_peaks_by_level.mean())
print('Std peaks')
display(sicer_peaks_by_level.std())

In [None]:
# Average peak length per file; a file without peaks gets 0 rather than a
# division by zero.
dfsc['av_length'] = [
    total_length / peak_count if peak_count != 0 else 0
    for peak_count, total_length in zip(dfsc['peaks'], dfsc['length'])
]
sicer_av_length_by_level = dfsc.groupby(['modification', 'level'])['av_length']
print('Mean average length')
display(sicer_av_length_by_level.mean())
print('Std average length')
display(sicer_av_length_by_level.std())

In [None]:
# SICER level labels (as stored in the 'level' column) kept for the summary.
sicerlevels2process = {
    'sicer FDR0.01',
}

# SPAN

In [None]:
SPAN_FOLDER = f'{PATH}/span'
GAPS = [3]
FDRS = ['0.05']
# Level tags are '200_<fdr>_<gap>' for every FDR / gap combination (FDR-major order).
SPAN_LEVELS = [f'200_{fdr}_{gap}' for fdr in FDRS for gap in GAPS]

# One row per SPAN peak file: the modification / replicate / level it belongs
# to, the number of peaks it holds and their total length.
dfs = pd.DataFrame(
    columns=['modification', 'replicate', 'level', 'file', 'peaks', 'length'],
    dtype=object,
)
for file in tqdm(glob.glob(SPAN_FOLDER + '/*.peak')):
    # Files with 'Input' anywhere in their path are left out.
    if 'Input' in file:
        continue
    # First configured level / modification whose '_<tag>' occurs in the path.
    level = next(filter(lambda lvl: f'_{lvl}' in file, SPAN_LEVELS), None)
    modification = next(filter(lambda mod: f'_{mod}' in file, MODIFICATIONS), None)
    if not (level and modification):
        continue
    # Replicate id: first three characters of the file name once the
    # 'H1_<modification>_' part has been removed.
    replicate = os.path.basename(file).replace(f'H1_{modification}_', '')[:3]
    peaks = lines(file)
    length = bedl(file).sum()
    dfs.loc[len(dfs)] = (modification, replicate, f'span {level}', file, peaks, length)

# The frame was created with object columns; make the numeric ones integers.
for column in ('peaks', 'length'):
    dfs[column] = dfs[column].astype(int)

In [None]:
# Number of SPAN files found per (modification, level).
span_file_counts = dfs.groupby(['modification', 'level'])['file'].count()
display(span_file_counts.astype(int))

In [None]:
# Mean and standard deviation of the SPAN peak counts per (modification, level).
span_peaks_by_level = dfs.groupby(['modification', 'level'])['peaks']
print('Mean peaks')
display(span_peaks_by_level.mean())
print('Std peaks')
display(span_peaks_by_level.std())

In [None]:
# Average peak length per file; a file without peaks gets 0 rather than a
# division by zero.
dfs['av_length'] = [
    total_length / peak_count if peak_count != 0 else 0
    for peak_count, total_length in zip(dfs['peaks'], dfs['length'])
]
span_av_length_by_level = dfs.groupby(['modification', 'level'])['av_length']
print('Mean average length')
display(span_av_length_by_level.mean())
print('Std average length')
display(span_av_length_by_level.std())

In [None]:
# Split every SPAN level label ('span 200_<fdr>_<gap>') into numeric 'fdr' and
# 'gap' columns. The two trailing '_'-separated fields are taken by position,
# so any gap value parses correctly; the previous regex only stripped the gaps
# 0, 3, 5 and 10 and would have glued part of any other gap onto the FDR.
span_level_fields = [label.rsplit('_', 2) for label in dfs['level']]
dfs['fdr'] = [float(fields[1]) for fields in span_level_fields]
dfs['gap'] = [int(fields[2]) for fields in span_level_fields]
dfs.sort_values(by=['fdr', 'gap'], inplace=True)
dfs.head()

In [None]:
# SPAN level labels (as stored in the 'level' column) kept for the summary.
spanlevels2process = {
    'span 200_0.05_3',
}

# Summary

In [None]:
# Keep only the selected levels of every tool and stack them into one table.
selected_macs2 = dfm.loc[dfm['level'].isin(macs2levels2process)]
selected_sicer = dfsc.loc[dfsc['level'].isin(sicerlevels2process)]
selected_span = dfs.loc[dfs['level'].isin(spanlevels2process)]
dfa = pd.concat([selected_macs2, selected_sicer, selected_span])

# Group analysis

In [None]:
def plot_data_cells(df, cid, value, description):
    """Draw one bar + swarm panel per modification, side by side, on a shared y range.

    Args:
        df: table with a 'modification' column plus the `cid` and `value` columns.
        cid: column whose values become the x categories inside every panel.
        value: column plotted on the y axis; also used as the output file name.
        description: y-axis label, shown on the left-most panel only.

    The figure is saved into FIGURES_DIR as '<value>.<EXT>' and then shown.
    """
    categories = sorted(set(df[cid]))
    panel_width = len(categories)
    total = panel_width * len(MODIFICATIONS)
    plt.figure(figsize=(int(total / 3), 4))

    # Axes -> y limits right after plotting, used to align all panels afterwards.
    panel_ylims = {}
    for position, m in enumerate(MODIFICATIONS):
        offset = position * panel_width
        data = df.loc[df['modification'] == m].sort_values(by=[cid])
        # Categories present for this modification, in sorted order, no repeats.
        xlabels = list(dict.fromkeys(data[cid]))

        ax = plt.subplot2grid((1, total), (0, offset), colspan=panel_width)
        sns.barplot(data=data, x=cid, y=value,
                    capsize=.2, errwidth=2, alpha=0.8, ax=ax)
        sns.swarmplot(data=data, x=cid, y=value,
                      size=1, color="black", alpha=0.5, ax=ax)
        ax.legend().set_visible(False)
        panel_ylims[ax] = plt.ylim()

        # Only the left-most panel carries the y tick labels and the axis label.
        if offset > 0:
            ax.get_yaxis().set_ticklabels([])
            ax.set_ylabel('')
        else:
            ax.set_ylabel(description)

        ax.set_xlabel('')
        ax.set_title(m)
        plt.xticks(range(0, len(xlabels)), xlabels, rotation=90)

    # Put every panel on the widest y range seen in any of them.
    ymin = np.min([low for low, _ in panel_ylims.values()])
    ymax = np.max([high for _, high in panel_ylims.values()])
    for ax in panel_ylims:
        ax.set_ylim(bottom=ymin, top=ymax)

    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, f'{value}.{EXT}'), bbox_inches='tight', dpi=300)
    plt.show()


In [None]:
# Limit max number of peaks (and average length) for visual representation.
# The clipped columns are assigned back explicitly: calling
# `dfa[col].clip(..., inplace=True)` acts on a column selection, which under
# pandas Copy-on-Write does not update `dfa` itself.
dfa['peaks'] = dfa['peaks'].clip(upper=100000)
dfa['av_length'] = dfa['av_length'].clip(upper=15000)

In [None]:
# Same grouped figure for both per-file metrics, split by tool level.
for metric, axis_label in (('peaks', 'Peaks'), ('av_length', 'Average peak length')):
    plot_data_cells(dfa, 'level', metric, axis_label)

# Consistency analysis between replicates

In [None]:
import tempfile

reps_overlap = pd.DataFrame(columns=['modification', 'tool', 'rep1', 'rep2',
                                      'peaks1', 'peaks2', 'length1', 'length2',
                                      'overlap1', 'overlap2', 'intersect'], dtype=object)

tools = list(sorted(set(dfa['level'])))
for m in tqdm(MODIFICATIONS):
    print(m)
    tm = dfa.loc[dfa['modification'] == m]
    reps = list(sorted(set(tm['replicate']))) 
    for tool in tools:
        for i in range(len(reps)):
            for j in range(i + 1, len(reps)):
                rep1, rep2 = reps[i], reps[j]
                t1 = tm.loc[(tm['level'] == tool) & (tm['replicate'] == rep1)]
                t2 = tm.loc[(tm['level'] == tool) & (tm['replicate'] == rep2)]
                m = t1['modification'].values[0]
                file1 = t1['file'].values[0]
                peaks1 = t1['peaks'].values[0]
                length1 = bedl(file1).sum()
                file2 = t2['file'].values[0]
                peaks2 = t2['peaks'].values[0]
                length2 = bedl(file2).sum()
                overlap1 = !bedtools intersect -a {file1} -b {file2} -wa -u | wc -l
                overlap1 = int(overlap1[0])
                overlap2 = !bedtools intersect -b {file1} -a {file2} -wa -u | wc -l            
                overlap2 = int(overlap2[0])            
                tf = tempfile.mktemp() 
                !bedtools intersect -b {file1} -a {file2} > {tf}
                intersectionl = bedl(tf).sum()
                reps_overlap.loc[len(reps_overlap)] = \
                    (m, tool, rep1, rep2, peaks1, peaks2, length1, length2, 
                     overlap1, overlap2, intersectionl)

In [None]:
# Two rows per replicate pair: the share of each replicate's peaks that overlap
# the other replicate (0 when the replicate has no peaks).
ro = pd.DataFrame(columns=['Modification', 'Replicate', 'Tool', 'Overlap'], dtype=object)
for _, row in reps_overlap.iterrows():
    m = row['modification']
    tool = row['tool']
    # Read the replicate names from the row itself; the labels used to be built
    # from `rep1` / `rep2` left over from the cell that filled `reps_overlap`,
    # so every row carried the names of the last pair processed there.
    rep1 = row['rep1']
    rep2 = row['rep2']
    peaks1 = row['peaks1']
    peaks2 = row['peaks2']
    overlap1 = row['overlap1']
    overlap2 = row['overlap2']
    ro.loc[len(ro)] = (m, f'{rep1} vs {rep2}', tool, overlap1 / peaks1 if peaks1 != 0 else 0)
    ro.loc[len(ro)] = (m, f'{rep2} vs {rep1}', tool, overlap2 / peaks2 if peaks2 != 0 else 0)

plt.figure(figsize=(10, 3))
plt.title('Overlap between replicates')
sns.barplot(data=ro, x='Modification', y='Overlap', hue='Tool', capsize=.1, errwidth=2, edgecolor="black")
# sns.boxplot(data=ro, x='Modification', y='Overlap', hue='Tool')
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, f'overlap.{EXT}'), bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# One row per replicate pair: Jaccard index of the two peak sets measured in
# covered length, i.e. intersection / (length1 + length2 - intersection).
rj = pd.DataFrame(columns=['Modification', 'Replicate', 'Tool', 'Jaccard'], dtype=object)
for _, row in reps_overlap.iterrows():
    m = row['modification']
    tool = row['tool']
    rep1 = row['rep1']
    rep2 = row['rep2']
    length1 = row['length1']
    length2 = row['length2']
    intersection = row['intersect']
    union = length1 + length2 - intersection
    # Test for an empty union explicitly: with numpy scalars a division by zero
    # yields nan/inf and a warning instead of raising, so the former bare
    # `except` could not be relied on to produce the intended 0.
    jaccard = intersection / union if union != 0 else 0
    rj.loc[len(rj)] = (m, f'{rep1} vs {rep2}', tool, jaccard)

plt.figure(figsize=(10, 3))
plt.title('Jaccard overlap between replicates')
sns.barplot(data=rj, x='Modification', y='Jaccard', hue='Tool', capsize=.1, errwidth=2, edgecolor="black")
# sns.boxplot(data=rj, x='Modification', y='Jaccard', hue='Tool')
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, f'jaccard.{EXT}'), bbox_inches='tight', dpi=300)
plt.show()

# Consistency analysis between different tools

In [None]:
import tempfile

tools_overlap = pd.DataFrame(columns=['modification', 'rep', 'tool1', 'tool2',
                                      'peaks1', 'peaks2', 'length1', 'length2',
                                      'overlap1', 'overlap2', 'intersect'], dtype=object)

tools = list(sorted(set(dfa['level'])))
for m in tqdm(MODIFICATIONS):
    print(m)
    tm = dfa.loc[dfa['modification'] == m]
    reps = list(sorted(set(tm['replicate']))) 
    for rep in reps:
        for i in range(len(tools)):
            for j in range(i + 1, len(tools)):
                tool1, tool2 = tools[i], tools[j]
                t1 = tm.loc[(tm['level'] == tool1) & (tm['replicate'] == rep)]
                t2 = tm.loc[(tm['level'] == tool2) & (tm['replicate'] == rep)]
                m = t1['modification'].values[0]
                file1 = t1['file'].values[0]
                peaks1 = t1['peaks'].values[0]
                length1 = bedl(file1).sum()
                file2 = t2['file'].values[0]
                peaks2 = t2['peaks'].values[0]
                length2 = bedl(file2).sum()
                overlap1 = !bedtools intersect -a {file1} -b {file2} -wa -u | wc -l
                overlap1 = int(overlap1[0])
                overlap2 = !bedtools intersect -b {file1} -a {file2} -wa -u | wc -l            
                overlap2 = int(overlap2[0])            
                tf = tempfile.mktemp() 
                !bedtools intersect -b {file1} -a {file2} > {tf}
                intersectionl = bedl(tf).sum()
                tools_overlap.loc[len(tools_overlap)] = \
                    (m, rep, tool1, tool2, peaks1, peaks2, length1, length2, 
                     overlap1, overlap2, intersectionl)

## Overlap between tools

In [None]:
# Directed overlap table: for every tool pair of a replicate, one row per
# direction holding the share of the first tool's peaks that overlap the second
# tool (0 when the first tool has no peaks).
to = pd.DataFrame(columns=['modification', 'replicate', 'tool1', 'tool2', 'overlap'], dtype=object)
for _, row in tools_overlap.iterrows():
    m, rep = row['modification'], row['rep']
    tool1, tool2 = row['tool1'], row['tool2']
    directed_pairs = (
        (tool1, tool2, row['peaks1'], row['overlap1']),
        (tool2, tool1, row['peaks2'], row['overlap2']),
    )
    for source_tool, target_tool, source_peaks, source_overlap in directed_pairs:
        fraction = source_overlap / source_peaks if source_peaks != 0 else 0
        to.loc[len(to)] = (m, rep, source_tool, target_tool, fraction)
to

In [None]:
from math import ceil

# Heatmaps: mean directed overlap between tools, one panel per modification,
# laid out on a two-row grid.
fig = plt.figure(figsize=(20, 6))
ncols = int(ceil(len(MODIFICATIONS) / 2)) + 1

for i, m in enumerate(MODIFICATIONS):
    print(m)
    tom = to[to['modification'] == m].groupby(['tool1', 'tool2'])['overlap'].mean().reset_index()
    # A tool is never compared with itself, so those cells are missing after
    # the pivot; every missing cell is shown as 1.0.
    tomp = tom.pivot(index='tool1', columns='tool2', values='overlap').fillna(1.0)
    ax = plt.subplot(2, ncols, i + 1)
    # Show the y tick labels on the first panel of each grid row. The previous
    # hard-coded `i == 6` pointed at the last panel of the first row, because
    # the grid is `ncols` wide and the second row starts at i == ncols.
    sns.heatmap(tomp, ax=ax, vmin=0, vmax=1, annot=True, fmt='.2f',
                yticklabels=(i % ncols == 0), cbar=False, cmap='Blues')
    ax.set_xlabel(m)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, f'overlap_tools.{EXT}'), bbox_inches='tight', dpi=300)
plt.show()

## Jaccard between tools

In [None]:
# One row per (modification, replicate, tool pair): Jaccard index of the two
# peak sets measured in covered length, i.e.
# intersection / (length1 + length2 - intersection).
tj = pd.DataFrame(columns=['modification', 'replicate', 'tool1', 'tool2', 'jaccard'], dtype=object)
for _, row in tools_overlap.iterrows():
    m = row['modification']
    rep = row['rep']
    tool1 = row['tool1']
    tool2 = row['tool2']
    length1 = row['length1']
    length2 = row['length2']
    intersection = row['intersect']
    union = length1 + length2 - intersection
    # Test for an empty union explicitly: with numpy scalars a division by zero
    # yields nan/inf and a warning instead of raising, so the former bare
    # `except` could not be relied on to produce the intended 0.
    jaccard = intersection / union if union != 0 else 0
    tj.loc[len(tj)] = (m, rep, tool1, tool2, jaccard)

In [None]:
# `ceil` is imported here so that this cell does not depend on another cell
# having been run first.
from math import ceil

# Heatmaps: mean Jaccard index between tools, one panel per modification,
# laid out on a two-row grid.
fig = plt.figure(figsize=(20, 6))
ncols = int(ceil(len(MODIFICATIONS) / 2)) + 1

for i, m in enumerate(MODIFICATIONS):
    print(m)
    # Renumber the filtered rows from 0: they keep their labels from `tj`, so
    # `tjm.loc[len(tjm)]` could hit an existing label and overwrite a measured
    # row instead of appending the diagonal entry.
    tjm = tj[tj['modification'] == m].reset_index(drop=True)
    # Add the diagonal: every tool compared with itself has Jaccard 1.0.
    for t in tools:
        tjm.loc[len(tjm)] = (m, 'rep', t, t, 1.0)
    tjm = tjm.groupby(['tool1', 'tool2'])['jaccard'].mean().reset_index()
    tjmp = tjm.pivot(index='tool1', columns='tool2', values='jaccard')
    ax = plt.subplot(2, ncols, i + 1)
    # Show the y tick labels on the first panel of each grid row. The previous
    # hard-coded `i == 6` pointed at the last panel of the first row, because
    # the grid is `ncols` wide and the second row starts at i == ncols.
    sns.heatmap(tjmp, ax=ax, vmin=0, vmax=1, annot=True, fmt='.2f',
                yticklabels=(i % ncols == 0), cbar=False, cmap='Blues')
    ax.set_xlabel(m)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, f'jaccard_tools.{EXT}'), bbox_inches='tight', dpi=300)
plt.show()