# Analysis of ChIP-seq simulation

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
import os
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from collections import OrderedDict, Counter
from tqdm.auto import tqdm
import subprocess, tempfile

In [None]:
WORK_DIR = '/mnt/stripe/shpynov/2021_chips'

FIGURES_DIR = '/mnt/stripe/shpynov/2021_chips/figures'
EXT = 'png'
! mkdir -p {FIGURES_DIR}
! rm {FIGURES_DIR}/*

# Chips models analysis

In [None]:
import json

# Collect the model parameters stored in peaks/<modification>.json into one table.
# NOTE: the 'prc_rate' column holds the 'pcr_rate' JSON value; the column name is kept
# unchanged because it is part of the models.tsv schema.
model_records = []
for modification in ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']:
    path = os.path.join(WORK_DIR, 'peaks', f'{modification}.json')
    # The handle has a dedicated name so it cannot be confused with the pulldown parameter `f`
    with open(path) as model_file:
        m = json.load(model_file)
    model_records.append((modification, path,
                          m['frag']['k'], m['frag']['theta'], m['pcr_rate'],
                          m['pulldown']['f'], m['pulldown']['s']))

# Build the frame once instead of appending row by row
models_df = pd.DataFrame(model_records,
                         columns=['Modification', 'Model', 'k', 'theta', 'prc_rate', 'f', 's'],
                         dtype=object)
models_df.to_csv(os.path.join(WORK_DIR, 'models.tsv'), sep='\t', index=False)
models_df

In [None]:
# Gamma distribution parameters (k, theta) of the fragment lengths, one point per modification
fig = plt.figure(figsize=(6, 6))
gamma_ax = sns.scatterplot(x='k', y='theta', hue='Modification', markers=True, data=models_df)
gamma_ax.set_title('Fragment lengths gamma distribution')
fig.savefig(os.path.join(FIGURES_DIR, f'models_fragments_gamma.{EXT}'))
plt.show()

In [None]:
# Pulldown parameters of the models: `s` on the x axis against `f` on the y axis
fig = plt.figure(figsize=(6, 6))
pulldown_ax = sns.scatterplot(x='s', y='f', hue='Modification', markers=True, data=models_df)
pulldown_ax.set_title('Fraction of Reads in Peaks vs Bound Genome Fraction')
pulldown_ax.set_xlabel('Fraction of Reads in Peaks')
pulldown_ax.set_ylabel('Bound Genome Fraction')
fig.savefig(os.path.join(FIGURES_DIR, f'models_frip_vs_bound_genome.{EXT}'))
plt.show()

# Report analysis

In [None]:
df = pd.read_csv(os.path.join(WORK_DIR, 'report.tsv'), sep='\t', comment='#')

# Focus on specific tools only; SPAN-NZ2 (modified model) is deliberately not included
keep_tools = ['Macs2', 'Macs2Broad', 'SICER', 'SPAN-GAP5']
selected = (
    df['Tool'].isin(keep_tools)
    # Without mixed model
    & (df['Modification'] != 'mixed')
    # Fix SPAN default naming to avoid duplicate record
    & ~df['PeaksFile'].str.contains('_1E-6_', regex=False, na=False)
)
# .copy() makes the selection an independent frame, so the column assignments here and
# the in-place updates of the following cells do not trigger SettingWithCopyWarning
df = df.loc[selected].copy()

# Rename
df['Tool'] = df['Tool'].replace({'Macs2Broad': 'Macs2 broad', 'SPAN-GAP5': 'SPAN'})
df['Fdr'] = df['Fdr'].astype(float)

# Remove relaxed FDR setting to avoid explosion in #peaks
# df = df.loc[df['Fdr'] <= 0.05]
df

In [None]:
# Fixed row order for all the following cells
df = df.sort_values(by=['Modification', 'Mult', 'Library', 'Tool', 'Fdr'])

In [None]:
# Average peak length = total length / number of peaks, for ground truth and for called peaks.
# fillna(0) replaces every NaN in the frame (e.g. 0 / 0 when there are no peaks) with 0.
df = df.assign(
    TrueAverageLength=df['TrueLength'] / df['TruePeaks'],
    AverageLength=df['Length'] / df['Peaks'],
).fillna(0)

In [None]:
# log10 of the FDR threshold; the value for FDR 0.05 is pinned to the rounded -1.3
df['LogFdr'] = np.where(df['Fdr'] == 0.05, -1.3, np.log10(df['Fdr']))

In [None]:
# Peak-based (Precision, Recall) and length-based (PrecisionL, RecallL) ratios.
# NOTE(review): by the usual definitions precision is normalised by the called peaks and
# recall by the ground truth; the denominators here are the other way round. Confirm
# against the way PrecisionP / RecallP / Intersection are produced in report.tsv.
ratio_columns = {
    'Precision': ('PrecisionP', 'TruePeaks'),
    'Recall': ('RecallP', 'Peaks'),
    'PrecisionL': ('Intersection', 'TrueLength'),
    'RecallL': ('Intersection', 'Length'),
}
for ratio, (numerator, denominator) in ratio_columns.items():
    df[ratio] = df[numerator] / df[denominator]
# Undefined ratios (NaN, e.g. 0 / 0) count as 1; this fills every NaN in the frame
df = df.fillna(1)

# Jaccard index of called vs true peaks by length: intersection / union
union_length = df['Length'] + df['TrueLength'] - df['Intersection']
df['Jaccard'] = df['Intersection'] / union_length
# An undefined Jaccard (NaN) counts as 0
df = df.fillna(0)

In [None]:
# The caps overwrite the columns in place, so every later cell sees the capped values
print('Limit peaks number for visual representation')
df['Peaks'] = df['Peaks'].clip(upper=2000)
print('Limit average length for visual representation')
df['AverageLength'] = df['AverageLength'].clip(upper=15000)

In [None]:
print('Computing F1-score')


def _harmonic_mean(p, r):
    """Harmonic mean of p and r, or 0 when either of them is not positive."""
    if min(p, r) > 0:
        return 2 / (1 / p + 1 / r)
    return 0


# Peak-based and length-based F1 scores
df['F1'] = [_harmonic_mean(p, r) for p, r in zip(df['Precision'], df['Recall'])]
df['F1l'] = [_harmonic_mean(p, r) for p, r in zip(df['PrecisionL'], df['RecallL'])]
df.head()

# Summary number of peaks

In [None]:
print('Ground truth average number of peaks')
fig = plt.figure(figsize=(6, 5))
true_peaks_ax = sns.boxplot(x="Modification", y="TruePeaks", data=df)
plt.xticks(rotation=45, ha='right')
# Leave 20% headroom above the largest value
true_peaks_ax.set_ylim(-20, 1.2 * df['TruePeaks'].max())
fig.savefig(os.path.join(FIGURES_DIR, f'true_peaks_number.{EXT}'))
plt.show()

In [None]:
print('Ground truth average length of peaks')
fig = plt.figure(figsize=(6, 5))
true_length_ax = sns.boxplot(x="Modification", y="TrueAverageLength", data=df)
plt.xticks(rotation=45, ha='right')
# Leave 20% headroom above the largest value
true_length_ax.set_ylim(-20, 1.2 * df['TrueAverageLength'].max())
fig.savefig(os.path.join(FIGURES_DIR, f'true_peaks_lengths.{EXT}'))
plt.show()

In [None]:
# One distinct colour per tool, shared by the figures below through `palette`.
# plt.get_cmap is used instead of plt.cm.get_cmap: the latter is deprecated and removed
# in recent Matplotlib releases, while plt.get_cmap(name, lut) is available in old and new ones.
tool_names = sorted(set(df['Tool']))
cmap = plt.get_cmap('jet', len(tool_names))
palette = {t: cmap(i) for i, t in enumerate(tool_names)}

In [None]:
def summary_peaks_lengths(df, name):
    """Plot number of peaks and average peak length per FDR and tool.

    One figure with two panels is drawn for every modification found in `df`;
    the ground truth values are printed and the figure is saved to FIGURES_DIR
    as summary_peaks_length_{name}_{modification}.{EXT}.
    """
    # (printed label, ground truth label, ground truth column, plotted column)
    panels = (
        ('Peaks', 'True peaks', 'TruePeaks', 'Peaks'),
        ('Length', 'True average length', 'TrueAverageLength', 'AverageLength'),
    )
    for m in sorted(set(df['Modification'])):
        t = df.loc[df['Modification'] == m].sort_values(by=['Tool', 'Fdr'])

        plt.figure(figsize=(15, 5))
        for position, (label, truth_label, truth_column, column) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            print(label, m)
            print(f"{truth_label} {set(t[truth_column])}")
            sns.boxplot(x="Fdr", y=column, hue="Tool", data=t, palette=palette)
            plt.title(f'{m} {name}')
            plt.xticks(rotation=45, ha='right')
            plt.legend(loc='upper left')

        plt.savefig(os.path.join(FIGURES_DIR, f'summary_peaks_length_{name}_{m}.{EXT}'))
        plt.show()

In [None]:
def good(df):
    """Boolean mask of the rows with the '1mln' library and Mult equal to 1.0."""
    is_full_library = df['Library'] == '1mln'
    is_full_mult = df['Mult'] == 1.0
    return is_full_library & is_full_mult


def low_frip(df):
    """Boolean mask of the rows with the '500k' library and Mult equal to 0.2."""
    # Avoid too big coverage-related outliers
    is_medium_library = df['Library'] == '500k'
    is_low_mult = df['Mult'] == 0.2
    return is_medium_library & is_low_mult


def low_cov(df):
    """Boolean mask of the rows with the '200k' library."""
    return df['Library'] == '200k'


def with_quality(df, add_all=False):
    """Stack the good / low FRIP / low coverage subsets of `df` with a 'Quality' label.

    Rows matching none of the three masks are dropped. With `add_all` the whole
    `df` is appended once more, labelled 'All'.
    """
    selectors = (('Good', good), ('Low FRIP', low_frip), ('Low Coverage', low_cov))
    parts = [df.loc[select(df)].assign(Quality=label) for label, select in selectors]
    if add_all:
        parts.append(df.assign(Quality='All'))
    return pd.concat(parts)

In [None]:
# Supplementary
# print('Summary number of peaks and length good quality')
# summary_peaks_lengths(df.loc[good(df)], 'good_quality')

In [None]:
# Supplementary
# print('Summary number of peaks and length on low coverage')
# summary_peaks_lengths(df.loc[low_cov(df)], 'low_coverage')

In [None]:
# Supplementary
# print('Summary number of peaks and length on low frip')
# summary_peaks_lengths(df.loc[low_frip(df)], 'low_frip')

In [None]:
# Supplementary
# print('Summary number of peaks and length')
# summary_peaks_lengths(df, 'all')

## Aggregated results

In [None]:
def aggregated_info_fdr(df, fdr):
    """Box plots of every metric per modification and tool at a single FDR threshold.

    Rows of `df` with Fdr equal to `fdr` are split by quality (see with_quality);
    for every quality one figure per metric is drawn and saved to FIGURES_DIR.
    The ground truth values are printed for the number of peaks and average length.
    """
    # (plotted column, printed label, title prefix, file name prefix, lower y limit)
    panels = [
        ('Peaks', 'Peaks', 'Number of peaks', 'aggregated_peaks', -20),
        ('AverageLength', 'Length', 'Average length', 'aggregated_lengths', -20),
        ('Jaccard', 'Jaccard', 'Average jaccard', 'jaccard', -0.1),
        ('Precision', 'Precision', 'Average precision', 'precision', -0.1),
        ('Recall', 'Recall', 'Average recall', 'recall', -0.1),
        ('F1', 'F1', 'Average f1', 'f1', -0.1),
        ('PrecisionL', 'PrecisionL', 'Average precision', 'precisionl', -0.1),
        ('RecallL', 'RecallL', 'Average recall', 'recalll', -0.1),
        ('F1l', 'F1l', 'Average f1l', 'f1l', -0.1),
    ]
    # Plotted column -> (printed label, column) of the matching ground truth
    ground_truth = {
        'Peaks': ('True peaks', 'TruePeaks'),
        'AverageLength': ('True average length', 'TrueAverageLength'),
    }

    df = with_quality(df.loc[df['Fdr'] == fdr])

    for q in sorted(set(df['Quality'])):
        t = df.loc[df['Quality'] == q].sort_values(by=['Modification', 'Tool', 'Fdr'])

        for column, label, title, file_prefix, y_min in panels:
            plt.figure(figsize=(8, 4))
            print(label, q)
            if column in ground_truth:
                truth_label, truth_column = ground_truth[column]
                print(f"{truth_label} {set(t[truth_column])}")
            sns.boxplot(data=t, x="Modification", y=column, hue="Tool", palette=palette)
            plt.title(f'{title} {q} fdr={fdr}')
            plt.xticks(rotation=45, ha='right')
            plt.legend(loc='lower left')
            # The upper limit always comes from the plotted column itself, so the
            # length-based metrics are no longer scaled by their peak-based counterparts
            plt.ylim(y_min, t[column].max() * 1.2)
            plt.savefig(os.path.join(FIGURES_DIR, f'{file_prefix}_{q}_fdr{fdr}.{EXT}'))
            plt.show()

In [None]:
# Aggregated box plots for the FDR threshold 0.05
aggregated_info_fdr(df, 0.05)

In [None]:
# SPAN H3K4me3 records at FDR 0.05 with more than 1000 peaks
df.query("Modification == 'H3K4me3' and Peaks > 1000 and Tool == 'SPAN' and Fdr == 0.05")

In [None]:
def aggregated_peaks_lengths(df):
    """Per-quality box plots of peak number and average length by tool, FDR and modification.

    For every quality (see with_quality) two faceted figures are drawn, one column
    per modification, and saved to FIGURES_DIR; the ground truth values are printed.
    """
    df = with_quality(df)

    for q in sorted(set(df['Quality'])):
        t = df.loc[df['Quality'] == q].sort_values(by=['Modification', 'Tool', 'Fdr'])

        # sns.catplot is figure-level: it creates its own figure, which becomes the
        # current one. No plt.figure() call is made beforehand, because it would
        # only leave an empty figure behind.
        print('Peaks', q)
        print(f"True peaks {set(t['TruePeaks'])}")
        sns.catplot(data=t, x="Tool", y="Peaks", hue="Fdr", col="Modification",
                    kind="box", aspect=.6, palette='mako')
        plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_peaks_{q}.{EXT}'))
        plt.show()

        print('Length', q)
        print(f"True average length {set(t['TrueAverageLength'])}")
        sns.catplot(data=t, x="Tool", y="AverageLength", hue="Fdr", col="Modification",
                    kind="box", aspect=.6, palette='mako')
        plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_length_{q}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# aggregated_peaks_lengths(df)

## Peaks vs Average length

In [None]:
def summary_vs(df, name, v1, v2):
    """Plot column `v2` against column `v1` along the FDR thresholds, one figure per modification.

    A separate curve per tool is drawn for every (library, mult, I) combination
    present in `df`; figures are saved to FIGURES_DIR as
    summary_{v1}_vs_{v2}_{modification}_{name}.{EXT}.
    """
    for m in sorted(set(df['Modification'])):
        print(m)
        t = df.loc[df['Modification'] == m]
        fig = plt.figure(figsize=(12, 6))
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for i in set(t['I']):
                    # All conditions are evaluated on `t`, so the mask shares its index
                    tt = t.loc[(t['Mult'] == mult) & (t['Library'] == lib) & (t['I'] == i)]
                    if tt.empty:
                        continue
                    tt = tt.sort_values(by=['Tool', 'Fdr'])
                    # hue='Tool' already separates the tools, so every curve is drawn once;
                    # redrawing it per tool would only compound the opacity
                    sns.lineplot(data=tt, x=v1, y=v2, hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.2)
        # Single legend: keep the first entries only (the tools plus one more entry)
        # plt.gca() returns the axes drawn on; plt.axes() would create a new, empty one
        handles, labels = plt.gca().get_legend_handles_labels()
        n_entries = len(set(t['Tool'])) + 1
        plt.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        plt.title(f'{m} {name}')
        plt.savefig(os.path.join(FIGURES_DIR, f'summary_{v1}_vs_{v2}_{m}_{name}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# print('Peaks vs Average Length')
# summary_vs(df.loc[good(df)], 'good_quality', 'Peaks', 'AverageLength')

In [None]:
def peaks_lengths(df):
    """3x3 grids (library x mult) of box plots per FDR and tool for every modification.

    Two figures are drawn per modification: number of peaks (peaks_{m}.{EXT}) and
    average peak length (length_{m}.{EXT}); the ground truth values are printed.
    Panels without data are left empty.
    """
    # (printed label, ground truth label, ground truth column, plotted column, file prefix)
    panels = [
        ('Peaks', 'True peaks', 'TruePeaks', 'Peaks', 'peaks'),
        ('Length', 'True average length', 'TrueAverageLength', 'AverageLength', 'length'),
    ]
    for m in sorted(set(df['Modification'])):
        dfm = df.loc[df['Modification'] == m]
        for label, truth_label, truth_column, column, file_prefix in panels:
            print(label, m)
            print(f"{truth_label} {set(dfm[truth_column])}")
            plt.figure(figsize=(5 * 3, 5 * 3))
            i = 1
            for lib in ['1mln', '500k', '200k']:
                for mult in [1.0, 0.5, 0.2]:
                    t = dfm.loc[(dfm['Mult'] == mult) & (dfm['Library'] == lib)]
                    ax = plt.subplot(3, 3, i)
                    plt.title(f'{m} {lib} {mult}')
                    # An empty selection has no maximum for ylim and produces no legend
                    if not t.empty:
                        sns.boxplot(data=t, x="Fdr", y=column, hue="Tool", palette=palette, ax=ax)
                        plt.xticks(rotation=45, ha='right')
                        plt.ylim(-20, t[column].max() * 1.2)
                        # The legend is shown on the first panel only
                        if i == 1:
                            ax.legend(loc='upper left')
                        elif ax.get_legend() is not None:
                            ax.get_legend().remove()
                    i += 1
            plt.savefig(os.path.join(FIGURES_DIR, f'{file_prefix}_{m}.{EXT}'))
            plt.show()

In [None]:
# Supplementary
# print('Summary number of peaks and length')
# peaks_lengths(df)

## Precision / Recall

In [None]:
def precision_recall_detailed(df):
    """3x3 grids (library x mult) of precision / recall curves for every modification.

    In every panel one curve per tool is drawn for each value of `I`; the figure
    is saved to FIGURES_DIR as prc_plot_{modification}.{EXT}.
    """
    for m in sorted(set(df['Modification'])):
        print(m)
        fig = plt.figure(figsize=(5 * 3, 5 * 3))
        i = 1
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                t = df.loc[(df['Modification'] == m) & (df['Mult'] == mult) & (df['Library'] == lib)]
                ax = plt.subplot(3, 3, i)
                for I in set(t['I']):
                    tt = t.loc[t['I'] == I].sort_values(by=['Tool', 'Recall', 'Precision'])
                    sns.lineplot(data=tt, x='Recall', y='Precision', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.5, ax=ax)
                if i == 1:
                    # Single legend: keep the first entries only (the tools plus one more entry).
                    # The tools are taken from the whole panel `t`: the per-replicate frame
                    # does not exist when the panel has no data.
                    handles, labels = ax.get_legend_handles_labels()
                    n_entries = len(set(t['Tool'])) + 1
                    ax.legend(handles=handles[:n_entries], labels=labels[:n_entries],
                              loc='lower left')
                elif ax.get_legend() is not None:
                    # A panel without data has no legend to remove
                    ax.get_legend().remove()

                i += 1
                plt.xlim(-0.1, 1.1)
                plt.ylim(-0.1, 1.1)
                plt.title(f'{m} {lib} {mult}')
        plt.savefig(os.path.join(FIGURES_DIR, f'prc_plot_{m}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# print('Detailed Precision / Recall plot')
# precision_recall_detailed(df)

In [None]:
def precision_recall(df, name):
    """Precision / recall curves of all conditions on one figure per modification.

    A separate curve per tool is drawn for every (library, mult, I) combination
    present in `df`; figures are saved to FIGURES_DIR as
    precision_recall_{name}_{modification}_{name}.{EXT}.
    """
    for m in sorted(set(df['Modification'])):
        print('Peaks', name, m)
        t = df.loc[df['Modification'] == m]
        fig = plt.figure(figsize=(6, 6))
        # Explicit axes of the new figure: the legend code below needs it
        ax = plt.gca()
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for i in set(t['I']):
                    # All conditions are evaluated on `t`, so the mask shares its index
                    tt = t.loc[(t['Mult'] == mult) & (t['Library'] == lib) & (t['I'] == i)]
                    if tt.empty:
                        continue
                    tt = tt.sort_values(by=['Tool', 'Recall', 'Precision'])
                    # hue='Tool' already separates the tools, so every curve is drawn once;
                    # redrawing it per tool would only compound the opacity
                    sns.lineplot(data=tt, x='Recall', y='Precision', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.2, ax=ax)

        # Single legend: keep the first entries only (the tools plus one more entry)
        handles, labels = ax.get_legend_handles_labels()
        n_entries = len(set(t['Tool'])) + 1
        ax.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        plt.xlim(min(0.6, t['Recall'].min()), 1.1)
        plt.ylim(min(0.6, t['Precision'].min()), 1.1)
        plt.title(f'{m} {name}')
        plt.savefig(os.path.join(FIGURES_DIR, f'precision_recall_{name}_{m}_{name}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# print('Precision / Recall')
# precision_recall(df.loc[good(df)], 'good_quality')

In [None]:
# Supplementary
# print('Precision / Recall low coverage')
# precision_recall(df.loc[low_cov(df)], 'low_coverage')

In [None]:
# Supplementary
# print('Precision / Recall low frip')
# precision_recall(df[low_frip(df)], 'low_frip')

## Aggregated precision / recall

In [None]:
def aggregated_precision_recall(df, name):
    """Peak-based precision / recall curves, one panel per modification in a single row.

    In every panel a separate curve per tool is drawn for each (library, mult, I)
    combination present in `df`. The figure is saved to FIGURES_DIR as
    aggregated_precision_recall_{name}.{EXT}.
    """
    print('Peaks', name)
    modifications = sorted(set(df['Modification']))
    fig = plt.figure(figsize=(18, 3))
    for i, m in enumerate(modifications):
        ax = plt.subplot(1, len(modifications), i + 1)
        tt = df.loc[df['Modification'] == m]
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for I in set(tt['I']):
                    ttt = tt.loc[(tt['Mult'] == mult) & (tt['Library'] == lib) & (tt['I'] == I)]
                    if ttt.empty:
                        continue
                    ttt = ttt.sort_values(by=['Tool', 'Recall', 'Precision'])
                    # hue='Tool' already separates the tools, so every curve is drawn once;
                    # redrawing it per tool would only compound the opacity.
                    # Raise alpha if single curves turn out too faint.
                    sns.lineplot(data=ttt, x='Recall', y='Precision', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.1, ax=ax)

        if i == 0:
            # Single legend: keep the first entries only (the tools plus one more entry)
            handles, labels = ax.get_legend_handles_labels()
            n_entries = len(set(tt['Tool'])) + 1
            ax.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        elif ax.get_legend() is not None:
            # A panel without data has no legend to remove
            ax.get_legend().remove()
        plt.xlim(-0.1, 1.1)
        plt.ylim(-0.1, 1.1)
        plt.title(m)
    plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_precision_recall_{name}.{EXT}'))
    plt.show()

In [None]:
# Peak-based precision / recall curves for the rows selected by good()
print('Precision / Recall')
aggregated_precision_recall(df.loc[good(df)], 'good_quality')

In [None]:
# Peak-based precision / recall curves for the rows selected by low_cov()
print('Precision / Recall low coverage')
aggregated_precision_recall(df.loc[low_cov(df)], 'low_coverage')

In [None]:
# Peak-based precision / recall curves for the rows selected by low_frip()
print('Precision / Recall low frip')
aggregated_precision_recall(df.loc[low_frip(df)], 'low_frip')

In [None]:
# aggregated_precision_recall(df, 'All') 

In [None]:
def aggregated_precision_recall_length(df, name):
    """Length-based precision / recall curves, one panel per modification in a single row.

    In every panel a separate curve per tool is drawn for each (library, mult, I)
    combination present in `df`, using the RecallL and PrecisionL columns. The figure
    is saved to FIGURES_DIR as aggregated_precision_recall_length_{name}.{EXT}.
    """
    print('Peaks', name)
    modifications = sorted(set(df['Modification']))
    fig = plt.figure(figsize=(18, 3))
    for i, m in enumerate(modifications):
        ax = plt.subplot(1, len(modifications), i + 1)
        tt = df.loc[df['Modification'] == m]
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for I in set(tt['I']):
                    ttt = tt.loc[(tt['Mult'] == mult) & (tt['Library'] == lib) & (tt['I'] == I)]
                    if ttt.empty:
                        continue
                    ttt = ttt.sort_values(by=['Tool', 'RecallL', 'PrecisionL'])
                    # hue='Tool' already separates the tools, so every curve is drawn once;
                    # redrawing it per tool would only compound the opacity.
                    # Raise alpha if single curves turn out too faint.
                    sns.lineplot(data=ttt, x='RecallL', y='PrecisionL', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.1, ax=ax)

        if i == 0:
            # Single legend: keep the first entries only (the tools plus one more entry)
            handles, labels = ax.get_legend_handles_labels()
            n_entries = len(set(tt['Tool'])) + 1
            ax.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        elif ax.get_legend() is not None:
            # A panel without data has no legend to remove
            ax.get_legend().remove()
        plt.xlim(-0.1, 1.1)
        plt.ylim(-0.1, 1.1)
        plt.title(m)
    plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_precision_recall_length_{name}.{EXT}'))
    plt.show()

In [None]:
# Length-based precision / recall curves for the rows selected by good()
print('Precision / Recall length')
aggregated_precision_recall_length(df.loc[good(df)], 'good_quality')

In [None]:
# Length-based precision / recall curves for the rows selected by low_cov()
print('Precision / Recall low coverage length')
aggregated_precision_recall_length(df.loc[low_cov(df)], 'low_coverage')

In [None]:
# Length-based precision / recall curves for the rows selected by low_frip()
print('Precision / Recall low frip length')
aggregated_precision_recall_length(df.loc[low_frip(df)], 'low_frip')

# Average precision

AP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight:

$AP = \sum_n (R_n - R_{n-1}) P_n$, where $P_n$ and $R_n$ are the precision and recall at the $n$-th threshold.

This implementation is not interpolated and is different from computing the area under the precision-recall curve with the trapezoidal rule, which uses linear interpolation and can be too optimistic.

In [None]:
def _average_precision(points, recall, precision):
    """Non-interpolated AP = sum((R_n - R_{n-1}) * P_n) over the rows of `points`.

    The rows are ordered by the `recall` column (ties by `precision`) first, so
    every recall increment is taken along the curve it belongs to.
    Returns 0 for an empty frame.
    """
    ordered = points.sort_values(by=[recall, precision])
    ap, r_prev = 0, 0
    for r, p in zip(ordered[recall], ordered[precision]):
        ap += (r - r_prev) * p
        r_prev = r
    return ap


# AP (peak-based) and APL (length-based) of every tool in every simulated condition.
# One record per (modification, library, mult, tool, I); the frame is built once at the end.
ap_records = []
for m in sorted(set(df['Modification'])):
    print(m)
    for lib in ['1mln', '500k', '200k']:
        for mult in [1.0, 0.5, 0.2]:
            t = df.loc[(df['Modification'] == m) & (df['Mult'] == mult) & (df['Library'] == lib)]
            for tool in sorted(set(t['Tool'])):
                for i in sorted(set(t['I'])):
                    tt = t.loc[(t['Tool'] == tool) & (t['I'] == i)]
                    # APL uses its own ordering by RecallL: reusing the order of the
                    # peak-based Recall could yield negative recall increments
                    ap = _average_precision(tt, 'Recall', 'Precision')
                    apl = _average_precision(tt, 'RecallL', 'PrecisionL')
                    ap_records.append((m, lib, mult, i, tool, ap, apl))

dfap = pd.DataFrame(ap_records,
                    columns=['Modification', 'Library', 'Mult', 'I', 'Tool', 'AP', 'APL'])


# Aggregated AP score

In [None]:
def plot_aggregated_ap(dfap):
    """Box plots of the AP score per modification and tool, one figure per quality.

    `dfap` is split by quality with with_quality (including 'All'); every figure
    is saved to FIGURES_DIR as aggregated_ap_{quality}.{EXT}.
    """
    dfap = with_quality(dfap, True)

    for q in sorted(set(dfap['Quality'])):
        print(q)
        fig = plt.figure(figsize=(12, 4))
        tt = dfap.loc[(dfap['Quality'] == q)].sort_values(by=['Modification', 'Tool'])
        # Shared tool palette, so that tools keep the colours they have in the other figures
        sns.boxplot(x='Modification', y='AP', hue='Tool', data=tt, palette=palette)
        plt.ylim(-0.1, 1.1)
        plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_ap_{q}.{EXT}'))
        plt.show()


In [None]:
# AP box plots for every quality subset of dfap
plot_aggregated_ap(dfap)

In [None]:
def plot_aggregated_apl(dfap):
    """Box plots of the APL score per modification and tool, one figure per quality.

    `dfap` is split by quality with with_quality (including 'All'); every figure
    is saved to FIGURES_DIR as aggregated_apl_{quality}.{EXT}.
    """
    dfap = with_quality(dfap, True)

    for q in sorted(set(dfap['Quality'])):
        print(q)
        fig = plt.figure(figsize=(12, 4))
        tt = dfap.loc[(dfap['Quality'] == q)].sort_values(by=['Modification', 'Tool'])
        # Shared tool palette, so that tools keep the colours they have in the other figures
        sns.boxplot(x='Modification', y='APL', hue='Tool', data=tt, palette=palette)
        plt.ylim(-0.1, 1.1)
        plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_apl_{q}.{EXT}'))
        plt.show()



In [None]:
# APL box plots for every quality subset of dfap
plot_aggregated_apl(dfap)

# Average precision score detailed

In [None]:
def avp_detailed(dfap):
    """3x3 grids (library x mult) of AP box plots per tool for every modification.

    Every figure is saved to FIGURES_DIR as ap_{modification}.{EXT}.
    """
    for m in sorted(set(dfap['Modification'])):
        print(m)
        fig = plt.figure(figsize=(5 * 3, 5 * 3))
        i = 1
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                tt = dfap.loc[(dfap['Modification'] == m) &
                              (dfap['Mult'] == mult) & (dfap['Library'] == lib)]
                tt = tt.sort_values(by=['Tool'])
                ax = plt.subplot(3, 3, i)
                # The tools are on the x axis, so there are no labelled artists and
                # no legend is requested
                sns.boxplot(x='Tool', y='AP', data=tt, palette=palette)
                plt.title(f'{m} {lib} {mult}')
                i += 1
                plt.ylim(-0.1, 1.1)
        plt.savefig(os.path.join(FIGURES_DIR, f'ap_{m}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# print('AP for various conditions')
# avp_detailed(dfap)

In [None]:
def avpl_detailed(dfap):
    """3x3 grids (library x mult) of APL box plots per tool for every modification.

    Every figure is saved to FIGURES_DIR as apl_{modification}.{EXT}.
    """
    for m in sorted(set(dfap['Modification'])):
        print(m)
        fig = plt.figure(figsize=(5 * 3, 5 * 3))
        i = 1
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                tt = dfap.loc[(dfap['Modification'] == m) &
                              (dfap['Mult'] == mult) & (dfap['Library'] == lib)]
                tt = tt.sort_values(by=['Tool'])
                ax = plt.subplot(3, 3, i)
                # The tools are on the x axis, so there are no labelled artists and
                # no legend is requested
                sns.boxplot(x='Tool', y='APL', data=tt, palette=palette)
                plt.title(f'{m} {lib} {mult}')
                i += 1
                plt.ylim(-0.1, 1.1)
        plt.savefig(os.path.join(FIGURES_DIR, f'apl_{m}.{EXT}'))
        plt.show()

In [None]:
# Supplementary
# print('APL for various conditions')
# avpl_detailed(dfap)

# Coverage of peaks analysis H3K4me3

```
 ~= 290
# BED to tags
cd /mnt/stripe/shpynov/2021_chips_H3K4me3
SHIFT=145
for PILEUP_BED in *.bed; do
   echo ${PILEUP_BED};
   cat ${PILEUP_BED} |\
      awk -v OFS='\t' -v S=${SHIFT} \
      '{if ($6 != "-") {print($1, $2+S, $2+S+1)} else {if ($3-S>=1) {print($1, $3-S, $3-S+1)}}}' |\
      sort -u -k1,1 -k3,3n -k2,2n > ${PILEUP_BED/.bed/.tags}
done

# Add coverage information to peaks
cd /mnt/stripe/shpynov/2021_chips_H3K4me3/peaks
TAGS_DIR=/mnt/stripe/shpynov/2021_chips_H3K4me3
CONTROL_TAGS=${TAGS_DIR}/input_H3K4me3_chr15.tags

# Keep the first four columns of every peak file. Files generated by an earlier run
# of this snippet are skipped, so that a re-run does not produce *.bed4.bed4 and the like.
for F in *; do
   case "$F" in *.bed4|*.bed4t|*.bed4tc) continue;; esac
   echo "$F"; awk -v OFS='\t' '{print $1,$2,$3,$4}' "$F" > "$F.bed4"
done

# add_coverage PEAKS.bed4 TREATMENT.tags
# Appends to every peak the number of overlapping treatment tags and then the number
# of overlapping control tags; the result is written to PEAKS.bed4tc.
add_coverage() {
   local F=$1 BF=$2
   echo "$F"; echo "$BF"
   bedtools intersect -a "$F" -b "$BF" -wa -c > "${F}t"
   bedtools intersect -a "${F}t" -b "$CONTROL_TAGS" -wa -c > "${F}tc"
   rm "${F}t"
}

# The treatment tags file name is the peak file name with the tool specific suffix
# replaced by .tags
for F in *.narrowPeak.bed4; do add_coverage "$F" "${TAGS_DIR}/${F/_q0.05*/.tags}"; done
for F in *.broadPeak.bed4; do add_coverage "$F" "${TAGS_DIR}/${F/_broad0.05*/.tags}"; done
for F in *.peak.bed4; do add_coverage "$F" "${TAGS_DIR}/${F/_200_0.05*/.tags}"; done
for F in *FDR0.05.bed4; do add_coverage "$F" "${TAGS_DIR}/${F/-W200*/.tags}"; done

# Coverage of the true peaks in every simulated library
TRUE_PEAKS=/mnt/stripe/shpynov/2021_chips/fastq/H3K4me3_chr15_1.bed
awk -v OFS='\t' '{print $1,$2,$3,$4}' "$TRUE_PEAKS" > tp.bed4
for Q in 1.0_1mln 1.0_200k 0.2_1mln 0.2_200k; do
   echo $Q; BF=${TAGS_DIR}/H3K4me3_chr15_1_${Q}.tags
   bedtools intersect -a tp.bed4 -b "$BF" -wa -c > tp_${Q}.bed4t
   bedtools intersect -a tp_${Q}.bed4t -b "$CONTROL_TAGS" -wa -c > tp_${Q}.bed4tc
   rm tp_${Q}.bed4t
done


# Same processing for the original (non-simulated) H3K4me3 data
cd /mnt/stripe/shpynov/2021_chips_H3K4me3/
bedtools bamtobed -i /mnt/stripe/shpynov/2021_chips/original_bams/H3K4me3.bam > orig_H3K4me3.bed
bedtools bamtobed -i /mnt/stripe/shpynov/2021_chips/original_bams/input_K4me3_K27me3.bam > orig_input_H3K4me3.bed

SHIFT=145
# The ";" before "do" is required when "do" is on the same line as the word list
for PILEUP_BED in orig_H3K4me3.bed orig_input_H3K4me3.bed; do
   cat ${PILEUP_BED} |\
      awk -v OFS='\t' -v S=${SHIFT} \
      '{if ($6 != "-") {print($1, $2+S, $2+S+1)} else {if ($3-S>=1) {print($1, $3-S, $3-S+1)}}}' |\
      sort -u -k1,1 -k3,3n -k2,2n > ${PILEUP_BED/.bed/.tags}
done

# The peaks with coverage are written to the peaks subdirectory: the tags are addressed
# as ../*.tags below, and the notebook reads peaks/orig_H3K4me3_peaks.bed4tc
cd peaks
cat /mnt/stripe/shpynov/2021_chips/peaks/H3K4me3_q0.05_peaks.narrowPeak | awk -v OFS='\t' '{print $1,$2,$3,$4}' > orig_H3K4me3_peaks.bed4
bedtools intersect -a orig_H3K4me3_peaks.bed4 -b ../orig_H3K4me3.tags -wa -c > orig_H3K4me3_peaks.bed4t
bedtools intersect -a orig_H3K4me3_peaks.bed4t -b ../orig_input_H3K4me3.tags -wa -c > orig_H3K4me3_peaks.bed4tc
rm orig_H3K4me3_peaks.bed4t
```

In [None]:
PATH2 = '/mnt/stripe/shpynov/2021_chips_H3K4me3'
TRUE_PEAKS = '/mnt/stripe/shpynov/2021_chips/fastq/H3K4me3_chr15_1.bed'
ORIG_PEAKS = f'{PATH2}/peaks/orig_H3K4me3_peaks.bed4tc'
QS = ['1.0_1mln', '1.0_200k', '0.2_1mln', '0.2_200k']

# Tool -> suffix of its peaks-with-coverage file, appended to H3K4me3_chr15_1_{q}
TOOL_SUFFIXES = {
    'MACS2': '_q0.05_peaks.narrowPeak.bed4tc',
    'MACS2 broad': '_broad0.05_peaks.broadPeak.bed4tc',
    'SICER': '-W200-G600-islands-summary-FDR0.05.bed4tc',
    'SPAN': '_200_0.05_5.peak.bed4tc',
}

# One frame of (tool, file, q) rows per value of QS, stacked into a single table
ts = [
    pd.DataFrame({
        'tool': list(TOOL_SUFFIXES),
        'file': [f'{PATH2}/peaks/H3K4me3_chr15_1_{q}{suffix}'
                 for suffix in TOOL_SUFFIXES.values()],
        'q': q,
    })
    for q in QS
]

k4me3df = pd.concat(ts)
k4me3df

In [None]:
def bedl(file):
    """Lengths (end - start) of the intervals of a headerless tab-separated BED file.

    Returns a pandas Series, or an empty list when the file is empty or missing.
    Any other problem (e.g. a malformed file) is raised instead of being hidden.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        return []  # Empty or missing file
    return tf[2] - tf[1]

def lines(file):
    """Number of records in a headerless tab-separated file.

    Returns 0 when the file is empty or missing. Any other problem
    (e.g. a malformed file) is raised instead of being hidden.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        return 0  # Empty or missing file
    return len(tf)
    
def d(a, b):
    """Safe division: a / b, or 0 when b is zero."""
    if b == 0:
        return 0
    return a / b


def coverage_t(file):
    """Fifth column (index 4) of a headerless tab-separated file.

    In the *.bed4tc files this is the count appended by the first
    `bedtools intersect -c` call, i.e. the treatment tags of each peak.
    Returns a pandas Series, or an empty list when the file is empty or missing.
    Any other problem (e.g. a missing column) is raised instead of being hidden.
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        return []  # Empty or missing file
    return tf[4]

def coverage_c(file):
    """Return column 5 of a headerless, tab-separated file.

    In the ``.bed4tc`` files used here column 5 is the second of the two counts
    appended by ``bedtools intersect -c`` (the control tags per peak).

    Parameters:
        file: path to the file to read.

    Returns:
        A pandas Series with column 5, or ``[]`` when the file is empty or
        does not exist.

    Raises:
        Any other read/parse error (e.g. a file with fewer than six columns)
        propagates instead of being silently reported as "no peaks".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        return []  # Empty (or absent) file
    return tf[5]

# Library sizes, taken as the number of rows in each .tags file:
# the chr15 control used for the simulated data, then the original
# treatment and control libraries.
coverage_chr15c, coverage_origt, coverage_origc = map(lines, (
    f'{PATH2}/input_H3K4me3_chr15.tags',
    f'{PATH2}/orig_H3K4me3.tags',
    f'{PATH2}/orig_input_H3K4me3.tags',
))

def coverage_a(file):
    """Return per-row treatment coverage minus scaled control coverage, floored at 0.

    Column 4 is taken as treatment and column 5 as control. The control is
    multiplied by ``min(1, treatment_library / control_library)``, so it is only
    ever scaled down, never up. Library sizes come from the module-level
    ``coverage_origt`` / ``coverage_origc`` when the path contains ``'orig'``;
    otherwise the treatment size is a constant chosen by whether the path
    contains ``'1mln'`` and the control size is ``coverage_chr15c``.

    Parameters:
        file: path (str) to a headerless, tab-separated file with at least six columns.

    Returns:
        A pandas Series of non-negative values, or ``[]`` when the file is empty
        or missing, or when the control library size is 0.

    Raises:
        Any other read/parse error (e.g. too few columns) propagates instead of
        being silently reported as "no peaks".
    """
    try:
        tf = pd.read_csv(file, sep='\t', header=None)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        return []  # Empty (or absent) file
    if 'orig' in file:
        coveraget, coveragec = coverage_origt, coverage_origc
    else:
        # NOTE(review): '1mln' maps to 10,000,000 here while '200k' maps to 200,000;
        # this looks like an extra zero (1,000,000 expected) — confirm before changing,
        # since it alters the control scaling factor.
        coveraget = 10000000 if '1mln' in file else 200000
        coveragec = coverage_chr15c
    if coveragec == 0:
        # No control tags were counted, so the scaling ratio is undefined. The former
        # bare `except` turned the resulting ZeroDivisionError into []; keep that result.
        return []
    return (tf[4] - tf[5] * min(1, coveraget / coveragec)).clip(lower=0)

def zip3(cname, ctype, cov):
    """Pair every value of cov with the same (cname, ctype) labels, as 3-tuples."""
    count = len(cov)
    names = [cname] * count
    kinds = [ctype] * count
    return zip(names, kinds, cov)

# Summary table: one row per (q, tool1, tool2) pair.
# Suffix 'l' = summed peak length; 't' / 'c' / 'a' = summed treatment / control /
# aggregated coverage. overlap* / diff* columns describe tool-vs-tool comparisons.
k4me3_comparison = pd.DataFrame(columns=['q', 'tool1', 'tool2',
                                         'peaks1', 'peaks2', 'length1', 'length2',
                                         'overlap1', 'overlap2', 'overlap1l', 'overlap2l',
                                         'diff1', 'diff2', 'diff1l', 'diff2l',
                                         'cov1t', 'cov1c', 'cov1a',
                                         'cov2t', 'cov2c', 'cov2a',
                                         'overlap1t', 'overlap1c', 'overlap1a',
                                         'overlap2t', 'overlap2c', 'overlap2a',
                                         'diff1t', 'diff1c', 'diff1a',
                                         'diff2t', 'diff2c', 'diff2a'], dtype=object)
tools = list(sorted(set(k4me3df['tool'])))

tp_length = bedl(TRUE_PEAKS)
orig_length = bedl(ORIG_PEAKS)
orig_covt, orig_covc, orig_cova = coverage_t(ORIG_PEAKS), coverage_c(ORIG_PEAKS), coverage_a(ORIG_PEAKS)


def _peak_stats(file):
    """Read peak count, peak lengths and the three coverage vectors of one peak file."""
    return dict(peaks=lines(file), length=bedl(file),
                covt=coverage_t(file), covc=coverage_c(file), cova=coverage_a(file))


def _distribution_rows(name, length, covt, covc, cova):
    """Build long-format (name, type, value) rows: lengths, raw coverages, then per-basepair coverages."""
    coverages = (('coverage', covt), ('control', covc), ('aggregated', cova))
    rows = list(zip3(name, 'length', length))
    for kind, cov in coverages:
        rows.extend(zip3(name, kind, cov))
    for kind, cov in coverages:
        rows.extend(zip3(name, f'pbp {kind}', [d(c, l) for c, l in zip(cov, length)]))
    return rows


def _labelled_rows(prefix, stats):
    """Long-format rows for a stats dict, labelled '<prefix> (<number of peaks>)'."""
    name = f'{prefix} ({stats["peaks"]})'
    return _distribution_rows(name, stats['length'], stats['covt'], stats['covc'], stats['cova'])


def _intersect_stats(out_file, a_file, b_file, *flags):
    """Run `bedtools intersect -a a_file -b b_file <flags>` into out_file and return its stats.

    Spaces in out_file are replaced by underscores. A failing bedtools run is
    reported but not fatal: the (empty) output is then treated as "no peaks".
    A missing bedtools executable raises FileNotFoundError.
    """
    out_file = out_file.replace(' ', '_')
    command = ['bedtools', 'intersect', '-a', str(a_file), '-b', str(b_file), *flags]
    with open(out_file, 'w') as out:
        result = subprocess.run(command, stdout=out, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0 or result.stderr:
        print(f'bedtools intersect -> {out_file} (exit code {result.returncode}): {result.stderr.strip()}')
    return _peak_stats(out_file)


def _coverage_totals(stats):
    """Total treatment, control and aggregated coverage of a stats dict."""
    return sum(stats['covt']), sum(stats['covc']), sum(stats['cova'])


def _plot_pair(ts, q, tool1, tool2):
    """Draw length / coverage / per-basepair coverage boxplots for one tool pair and save the figure."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    lengths = ts.loc[ts['type'] == 'length'][['name', 'coverage']].copy()
    lengths.columns = ['name', 'length']
    # Assign the clipped column back: `inplace=True` on a selected column is a
    # no-op under pandas Copy-on-Write, which would leave outliers unclipped.
    lengths['length'] = lengths['length'].clip(upper=20000)
    sns.boxplot(data=lengths, x='name', y='length', ax=axes[0])
    axes[0].set_title(f'Q={q} Peaks lengths')

    # (per-basepair?, clipping threshold, title) for the two coverage panels
    panels = ((False, 2000, f'Q={q} Coverage in peaks'),
              (True, 2.0, f'Q={q} Coverage per basepair in peaks'))
    for ax, (per_bp, cap, title) in zip(axes[1:], panels):
        selected = ts.loc[[ty != 'length' and ('pbp' in ty) == per_bp for ty in ts['type']]].copy()
        selected['coverage'] = selected['coverage'].clip(upper=cap)
        sns.boxplot(data=selected, x='name', hue='type', y='coverage', ax=ax)
        ax.set_title(title)

    for ax in axes:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

    fig.savefig(f'{PATH2}/figures/{q}_{tool1}_vs_{tool2}.png', bbox_inches='tight', dpi=300)
    plt.show()
    plt.close(fig)  # many figures are produced in the loop; release each one


# savefig does not create missing directories
os.makedirs(f'{PATH2}/figures', exist_ok=True)

for q in QS:
    print(q)
    tp_coverage_file = f'{PATH2}/peaks/tp_{q}.bed4tc'
    tpcovt, tpcovc, tpcova = coverage_t(tp_coverage_file), coverage_c(tp_coverage_file), coverage_a(tp_coverage_file)

    # Reference distributions are identical for every tool pair of this q.
    reference_rows = (
        _distribution_rows('Original Peaks', orig_length, orig_covt, orig_covc, orig_cova)
        + _distribution_rows('True Peaks (500)', tp_length, tpcovt, tpcovc, tpcova)
    )

    for i in tqdm(range(len(tools))):
        for j in range(i + 1, len(tools)):
            tool1, tool2 = tools[i], tools[j]
            t1 = k4me3df.loc[(k4me3df['q'] == q) & (k4me3df['tool'] == tool1)]
            t2 = k4me3df.loc[(k4me3df['q'] == q) & (k4me3df['tool'] == tool2)]
            file1 = t1['file'].values[0]
            file2 = t2['file'].values[0]

            stats1, stats2 = _peak_stats(file1), _peak_stats(file2)
            rows = list(reference_rows)
            rows.extend(_labelled_rows(tool1, stats1))
            rows.extend(_labelled_rows(tool2, stats2))

            # Peaks of each tool that overlap the true peaks
            for tool, file in ((tool1, file1), (tool2, file2)):
                stats = _intersect_stats(f'{PATH2}/{q}_overlap_{tool}_vs_TP.bed', file, TRUE_PEAKS, '-wa', '-u')
                rows.extend(_labelled_rows(f'overlap {tool} vs TP', stats))

            # Peaks of each tool that overlap the other tool's peaks
            overlap1 = _intersect_stats(f'{PATH2}/{q}_overlap_{tool1}_vs_{tool2}.bed', file1, file2, '-wa', '-u')
            rows.extend(_labelled_rows(f'overlap {tool1} vs {tool2}', overlap1))
            overlap2 = _intersect_stats(f'{PATH2}/{q}_overlap_{tool2}_vs_{tool1}.bed', file2, file1, '-wa', '-u')
            rows.extend(_labelled_rows(f'overlap {tool2} vs {tool1}', overlap2))

            # Peaks of each tool that miss the true peaks
            for tool, file in ((tool1, file1), (tool2, file2)):
                stats = _intersect_stats(f'{PATH2}/{q}_diff_{tool}_vs_TP.bed', file, TRUE_PEAKS, '-v')
                rows.extend(_labelled_rows(f'diff {tool} - TP', stats))

            # Peaks of each tool that the other tool does not call
            diff1 = _intersect_stats(f'{PATH2}/{q}_diff_{tool1}_vs_{tool2}.bed', file1, file2, '-v')
            rows.extend(_labelled_rows(f'diff {tool1} - {tool2}', diff1))
            diff2 = _intersect_stats(f'{PATH2}/{q}_diff_{tool2}_vs_{tool1}.bed', file2, file1, '-v')
            rows.extend(_labelled_rows(f'diff {tool2} - {tool1}', diff2))

            ts = pd.DataFrame(rows, columns=['name', 'type', 'coverage'])

            # Plot
            _plot_pair(ts, q, tool1, tool2)

            # The 'q' column holds the current configuration (previously the undefined name `n`).
            k4me3_comparison.loc[len(k4me3_comparison)] = (
                q, tool1, tool2,
                stats1['peaks'], stats2['peaks'], sum(stats1['length']), sum(stats2['length']),
                overlap1['peaks'], overlap2['peaks'], sum(overlap1['length']), sum(overlap2['length']),
                diff1['peaks'], diff2['peaks'], sum(diff1['length']), sum(diff2['length']),
                *_coverage_totals(stats1),
                *_coverage_totals(stats2),
                *_coverage_totals(overlap1),
                *_coverage_totals(overlap2),
                *_coverage_totals(diff1),
                *_coverage_totals(diff2),
            )

display(k4me3_comparison)
k4me3_comparison.to_csv(f'{PATH2}/k4me3_comparison.csv', index=False)