# Analysis of ChIP-seq simulation

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
import os
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from collections import OrderedDict, Counter

In [None]:
# Root directory with the simulation results (model JSON files, report.tsv).
WORK_DIR = '/mnt/stripe/shpynov/2021_chips'

# Figures are stored under WORK_DIR; derive the path instead of repeating the prefix.
FIGURES_DIR = os.path.join(WORK_DIR, 'figures')
os.makedirs(FIGURES_DIR, exist_ok=True)
# Start every run from an empty figures directory. Only regular, non-hidden files are
# removed (what the shell glob `rm DIR/*` targets); an already-empty directory is fine.
for entry in list(os.scandir(FIGURES_DIR)):
    if entry.is_file() and not entry.name.startswith('.'):
        os.remove(entry.path)

# Chips models analysis

In [None]:
import json

# Collect the model parameters of every histone modification into one table.
MODEL_COLUMNS = ['Modification', 'Model', 'k', 'theta', 'prc_rate', 'f', 's']
model_records = []
for modification in ['H3K27ac', 'H3K27me3', 'H3K36me3', 'H3K4me1', 'H3K4me3']:
    path = os.path.join(WORK_DIR, 'peaks', f'{modification}.json')
    # The handle gets its own name: the previous code called it `f` and then rebound
    # `f` to the pulldown parameter while still inside the `with` block.
    with open(path) as model_file:
        m = json.load(model_file)
    model_records.append({
        'Modification': modification,
        'Model': path,  # path of the JSON file the parameters were read from
        'k': m['frag']['k'],
        'theta': m['frag']['theta'],
        # The column keeps its existing 'prc_rate' spelling (the JSON key is 'pcr_rate')
        # so that the header of models.tsv does not change.
        'prc_rate': m['pcr_rate'],
        'f': m['pulldown']['f'],
        's': m['pulldown']['s'],
    })
# Build the frame once instead of appending row by row.
models_df = pd.DataFrame(model_records, columns=MODEL_COLUMNS)
models_df.to_csv(os.path.join(WORK_DIR, 'models.tsv'), sep='\t', index=False)
models_df

In [None]:
# One point per modification in (k, theta) space; per the title these are the
# parameters of the fragment length gamma distribution.
fig = plt.figure(figsize=(6, 6))
sns.scatterplot(x='k', y='theta', hue='Modification', markers=True, data=models_df)
plt.title('Fragment lengths gamma distribution')
gamma_pdf = os.path.join(FIGURES_DIR, 'models_fragments_gamma.pdf')
plt.savefig(gamma_pdf)
plt.show()

In [None]:
# Pulldown parameters of each model: 's' on the x axis against 'f' on the y axis.
fig = plt.figure(figsize=(6, 6))
sns.scatterplot(x='s', y='f', hue='Modification', markers=True, data=models_df)
plt.title('Fraction of Reads in Peaks vs Bound Genome Fraction')
plt.xlabel('Fraction of Reads in Peaks')
plt.ylabel('Bound Genome Fraction')
frip_pdf = os.path.join(FIGURES_DIR, 'models_frip_vs_bound_genome.pdf')
plt.savefig(frip_pdf)
plt.show()

# Report analysis

In [None]:
df = pd.read_csv(os.path.join(WORK_DIR, 'report.tsv'), sep='\t', comment='#')

# Focus on H3K4me3 now
# df = df.loc[df['Modification'] == 'H3K4me3']

# Focus on specific tools only.
# .copy() makes the selection an independent frame, so the assignments below modify
# `df` itself and do not trigger SettingWithCopyWarning.
df = df.loc[df['Tool'].isin(['Macs2', 'Macs2Broad', 'SICER', 'SPAN-GAP5'])].copy()

# Rename tools to the names used in the figures
df['Tool'] = df['Tool'].replace({'Macs2Broad': 'Macs2 broad', 'SPAN-GAP5': 'SPAN'})

# Fix SPAN default naming to avoid duplicate record
df = df.loc[['_1E-6_' not in pf for pf in df['PeaksFile']]].copy()
df['Fdr'] = df['Fdr'].astype(float)

# Remove relaxed FDR setting to avoid an explosion in the number of peaks
# df = df.loc[df['Fdr'] <= 0.05]
df

In [None]:
df.sort_values(by=['Modification', 'Mult', 'Library', 'Tool', 'Fdr'], inplace=True)

In [None]:
df['TrueAverageLength'] = df['TrueLength'] / df['TruePeaks']
df['AverageLength'] = df['Length'] / df['Peaks']
df.fillna(0, inplace=True)

In [None]:
df['LogFdr'] = np.log10(df['Fdr'])
df.loc[df['Fdr']==0.05, 'LogFdr'] = -1.3

In [None]:
# Keep the raw 'Precision' / 'Recall' values of the report in '*Peaks' columns and
# always normalise from those, so that re-running this cell does not divide the
# already-normalised fractions a second time.
# NOTE: a later cell caps 'Peaks' for plotting; re-running this cell after that one
# would normalise 'Recall' by the capped values.
if 'PrecisionPeaks' not in df.columns:
    df['PrecisionPeaks'] = df['Precision'].copy()
    df['RecallPeaks'] = df['Recall'].copy()
# NOTE(review): 'Precision' is normalised by 'TruePeaks' and 'Recall' by 'Peaks' --
# confirm this matches how report.tsv defines the two columns.
df['Precision'] = df['PrecisionPeaks'] / df['TruePeaks']
df['Recall'] = df['RecallPeaks'] / df['Peaks']
df.sort_values(by=['Recall', 'Precision'], ascending=[True, False], inplace=True)
# NaN produced by 0 / 0 above is treated as 1; fillna applies to every column.
df.fillna(1, inplace=True)

In [None]:
# Cap extreme values so that a few outliers do not flatten the box plots.
print('Limit peaks number for visual representation')
df['Peaks'] = df['Peaks'].clip(upper=2000)
print('Limit average length for visual representation')
df['AverageLength'] = df['AverageLength'].clip(upper=15000)

# Summary number of peaks

In [None]:
# One fixed colour per tool, shared by the figures below.
tool_names = sorted(set(df['Tool']))
# plt.cm.get_cmap was deprecated in matplotlib 3.7 and removed in 3.9;
# plt.get_cmap takes the same (name, lut) arguments.
cmap = plt.get_cmap('jet', len(tool_names))
palette = {t: cmap(i) for i, t in enumerate(tool_names)}

In [None]:
def summary_peaks_lengths(df, name):
    """Plot, per modification, box plots of peak count and average peak length by FDR.

    For every modification in `df` a two-panel figure is drawn (left: 'Peaks',
    right: 'AverageLength'), with one box per FDR value and tool. The figure is
    saved to FIGURES_DIR as 'summary_peaks_length_{name}_{m}.pdf' and shown.
    The sets of true peak counts / true average lengths are printed for reference.

    :param df: report frame with 'Modification', 'Tool', 'Fdr', 'Peaks',
        'AverageLength', 'TruePeaks' and 'TrueAverageLength' columns.
    :param name: label used in the panel titles and in the file name.
    """
    # (printed label, plotted column, ground-truth column, ground-truth label)
    panels = (
        ('Peaks', 'Peaks', 'TruePeaks', 'True peaks'),
        ('Length', 'AverageLength', 'TrueAverageLength', 'True average length'),
    )
    for m in sorted(set(df['Modification'])):
        t = df.loc[df['Modification'] == m].sort_values(by=['Tool', 'Fdr'])

        plt.figure(figsize=(15, 5))
        for position, (label, column, truth_column, truth_label) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            print(label, m)
            print(f"{truth_label} {set(t[truth_column])}")
            sns.boxplot(data=t, x="Fdr", y=column, hue="Tool", palette=palette)
            plt.title(f'{m} {name}')
            plt.xticks(rotation=45, ha='right')
            plt.legend(loc='upper left')

        pdf_path = os.path.join(FIGURES_DIR, f'summary_peaks_length_{name}_{m}.pdf')
        plt.savefig(pdf_path)
        plt.show()

In [None]:
print('Summary number of peaks and length good quality')
summary_peaks_lengths(df.loc[(df['Library'] == '1mln') & (df['Mult'] == 1.0)], 'good_quality')

In [None]:
print('Summary number of peaks and length on low coverage')
summary_peaks_lengths(df.loc[df['Library'] == '200k'], 'low_coverage')

In [None]:
print('Summary number of peaks and length on low frip')
summary_peaks_lengths(df.loc[df['Mult'] == 0.2], 'low_frip')

In [None]:
# Peak files of a few low-FRIP SPAN runs that report more than 1500 peaks.
# The tool was renamed from 'SPAN-GAP5' to 'SPAN' when the report was loaded, so the
# filter must use the new name; with the old one the selection was always empty.
list(df.loc[(df['Mult'] == 0.2) & (df['Peaks'] > 1500) & (df['Tool'] == 'SPAN')].head()['PeaksFile'])

In [None]:
# Same summary over the whole report, without any library / multiplier filter.
print('Summary number of peaks and length')
summary_peaks_lengths(df, 'all')

## Aggregated results

In [None]:
def aggregated_peaks_lengths(df):
    """Plot peak count and average length per tool and FDR for three quality subsets.

    The rows of `df` are grouped into quality subsets (a row may belong to several):
      * 'Good'         -- Library == '1mln' and Mult == 1.0
      * 'Low FRIP'     -- Mult == 0.2
      * 'Low Coverage' -- Library == '200k'
    For every subset two faceted box plots (one column per modification) are drawn,
    one for 'Peaks' and one for 'AverageLength', and saved to FIGURES_DIR as
    'aggregated_peaks_{quality}.pdf' and 'aggregated_length_{quality}.pdf'.

    :param df: report frame with 'Library', 'Mult', 'Modification', 'Tool', 'Fdr',
        'Peaks', 'AverageLength', 'TruePeaks' and 'TrueAverageLength' columns.
    """
    subsets = []
    for quality, mask in (('Good', (df['Library'] == '1mln') & (df['Mult'] == 1.0)),
                          ('Low FRIP', df['Mult'] == 0.2),
                          ('Low Coverage', df['Library'] == '200k')):
        subset = df.loc[mask].copy()
        subset['Quality'] = quality
        subsets.append(subset)
    df = pd.concat(subsets)

    # (printed label, ground-truth label, ground-truth column, plotted column, file stem)
    panels = (
        ('Peaks', 'True peaks', 'TruePeaks', 'Peaks', 'peaks'),
        ('Length', 'True average length', 'TrueAverageLength', 'AverageLength', 'length'),
    )
    for q in sorted(set(df['Quality'])):
        t = df.loc[df['Quality'] == q].sort_values(by=['Modification', 'Tool', 'Fdr'])

        for label, truth_label, truth_column, column, stem in panels:
            print(label, q)
            print(f"{truth_label} {set(t[truth_column])}")
            # catplot is a figure-level function and creates its own figure; calling
            # plt.figure() first only left an extra blank figure behind.
            grid = sns.catplot(data=t, x="Tool", y=column, hue="Fdr", col="Modification",
                               kind="box", aspect=.6, palette='mako')
            # Save through the grid so the file is guaranteed to hold this plot.
            grid.savefig(os.path.join(FIGURES_DIR, f'aggregated_{stem}_{q}.pdf'))
            plt.show()

In [None]:
# Aggregated peak count / average length plots for every quality subset of the report.
aggregated_peaks_lengths(df)

In [None]:
def aggregated_peaks_lengths_fdr(df, fdr):
    """Plot peak count and average length per modification and tool at one FDR level.

    Only rows whose 'Fdr' matches `fdr` are used. They are grouped into quality
    subsets (a row may belong to several):
      * 'Good'         -- Library == '1mln' and Mult == 1.0
      * 'Low FRIP'     -- Mult == 0.2
      * 'Low Coverage' -- Library == '200k'
    For every subset two box plots are saved to FIGURES_DIR:
    'aggregated_peaks_{quality}_fdr{fdr}.pdf' and 'aggregated_length_{quality}_fdr{fdr}.pdf'.

    :param df: report frame with 'Fdr', 'Library', 'Mult', 'Modification', 'Tool',
        'Peaks', 'AverageLength', 'TruePeaks' and 'TrueAverageLength' columns.
    :param fdr: FDR level to select, e.g. 0.05.
    """
    # Compare floats with a purely relative tolerance: exact `==` can miss a value that
    # differs in the last bit, while an absolute tolerance would merge tiny FDR levels.
    df = df.loc[np.isclose(df['Fdr'].astype(float), fdr, rtol=1e-9, atol=0)]

    subsets = []
    for quality, mask in (('Good', (df['Library'] == '1mln') & (df['Mult'] == 1.0)),
                          ('Low FRIP', df['Mult'] == 0.2),
                          ('Low Coverage', df['Library'] == '200k')):
        subset = df.loc[mask].copy()
        subset['Quality'] = quality
        subsets.append(subset)
    df = pd.concat(subsets)

    # (printed label, ground-truth label, ground-truth column, plotted column,
    #  title prefix, file stem)
    panels = (
        ('Peaks', 'True peaks', 'TruePeaks', 'Peaks', 'Number of peaks', 'peaks'),
        ('Length', 'True average length', 'TrueAverageLength', 'AverageLength',
         'Average length', 'length'),
    )
    for q in sorted(set(df['Quality'])):
        t = df.loc[df['Quality'] == q].sort_values(by=['Modification', 'Tool', 'Fdr'])

        for label, truth_label, truth_column, column, title, stem in panels:
            plt.figure(figsize=(12, 4))
            print(label, q)
            print(f"{truth_label} {set(t[truth_column])}")
            sns.boxplot(data=t, x="Modification", y=column, hue="Tool", palette=palette)
            plt.title(f'{title} {q} fdr={fdr}')
            plt.xticks(rotation=45, ha='right')
            plt.legend(loc='upper right')
            # Leave 20% headroom above the tallest box.
            plt.ylim(-20, t[column].max() * 1.2)

            plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_{stem}_{q}_fdr{fdr}.pdf'))
            plt.show()

In [None]:
# Aggregated peak count / average length plots restricted to the FDR level 0.05.
aggregated_peaks_lengths_fdr(df, 0.05)

## Peaks vs Average length

In [None]:
def summary_vs(df, name, v1, v2):
    """Plot column `v2` against column `v1` for every tool, one figure per modification.

    For each modification one line per (library, multiplier, replicate 'I', tool) is
    drawn, coloured by tool. The figure is saved to FIGURES_DIR as
    'summary_{v1}_vs_{v2}_{m}_{name}.pdf' and shown.

    Each line is drawn once with alpha=0.2. The previous version repeated every
    draw once per tool (an unused inner loop), which only made the lines darker.

    :param df: report frame with 'Modification', 'Library', 'Mult', 'I', 'Tool',
        'Fdr' and the `v1` / `v2` columns.
    :param name: label used in the title and in the file name.
    :param v1: column plotted on the x axis.
    :param v2: column plotted on the y axis.
    """
    for m in sorted(set(df['Modification'])):
        print(m)
        t = df.loc[df['Modification'] == m]
        fig, ax = plt.subplots(figsize=(12, 6))
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for i in set(t['I']):
                    # Build the mask from `t` only; mixing in `df['Mult']` relied on
                    # index alignment between two frames of different length.
                    tt = t.loc[(t['Mult'] == mult) & (t['Library'] == lib) & (t['I'] == i)]
                    if tt.empty:
                        continue
                    tt = tt.sort_values(by=['Tool', 'Fdr'])
                    sns.lineplot(data=tt, x=v1, y=v2, hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.2, ax=ax)
        # Single legend: every lineplot call adds its own entries, keep the first
        # group only (the 'Tool' header plus one entry per tool).
        # plt.axes() would create a new axes here instead of returning the current one.
        handles, labels = ax.get_legend_handles_labels()
        n_entries = len(set(t['Tool'])) + 1
        if handles:
            ax.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        ax.set_title(f'{m} {name}')
        plt.savefig(os.path.join(FIGURES_DIR, f'summary_{v1}_vs_{v2}_{m}_{name}.pdf'))
        plt.show()

In [None]:
print('Peaks vs Average Length')
# 'Mult' holds floats (it is compared with 1.0 / 0.5 / 0.2 everywhere else in this
# notebook); the previous comparison with the string '_1.0' never matched, so the
# selection was empty and no figure was produced.
summary_vs(df.loc[(df['Library'] == '1mln') & (df['Mult'] == 1.0)],
           'good_quality', 'Peaks', 'AverageLength')

In [None]:
def _fdr_boxplot_grid(df, m, column, file_name):
    """Draw a 3x3 grid (library rows x multiplier columns) of per-FDR box plots of `column`.

    Panels without data are left empty (title only) instead of raising. The legend is
    kept on the first panel that has data. The figure is saved to FIGURES_DIR/file_name.
    """
    plt.figure(figsize=(5 * 3, 5 * 3))
    legend_shown = False
    for row, lib in enumerate(['1mln', '500k', '200k']):
        for col, mult in enumerate([1.0, 0.5, 0.2]):
            t = df.loc[(df['Modification'] == m) & (df['Mult'] == mult) & (df['Library'] == lib)]
            ax = plt.subplot(3, 3, row * 3 + col + 1)
            plt.title(f'{m} {lib} {mult}')
            if t.empty:
                # No runs for this combination: nothing to plot, and max() would be NaN.
                continue
            sns.boxplot(data=t, x="Fdr", y=column, hue="Tool", palette=palette, ax=ax)
            plt.title(f'{m} {lib} {mult}')
            plt.xticks(rotation=45, ha='right')
            # Leave 20% headroom above the tallest box.
            plt.ylim(-20, t[column].max() * 1.2)
            if not legend_shown:
                ax.legend(loc='upper left')
                legend_shown = True
            else:
                legend = ax.get_legend()
                if legend is not None:
                    legend.remove()
    plt.savefig(os.path.join(FIGURES_DIR, file_name))
    plt.show()


def peaks_lengths(df):
    """Plot detailed peak count and average length grids for every modification.

    For each modification two 3x3 figures (library x multiplier) are produced, one for
    'Peaks' (saved as 'peaks_{m}.pdf') and one for 'AverageLength' ('length_{m}.pdf').
    The sets of true peak counts / true average lengths are printed for reference.

    :param df: report frame with 'Modification', 'Library', 'Mult', 'Tool', 'Fdr',
        'Peaks', 'AverageLength', 'TruePeaks' and 'TrueAverageLength' columns.
    """
    for m in sorted(set(df['Modification'])):
        of_modification = df.loc[df['Modification'] == m]

        print('Peaks', m)
        print(f"True peaks {set(of_modification['TruePeaks'])}")
        _fdr_boxplot_grid(of_modification, m, 'Peaks', f'peaks_{m}.pdf')

        print('Length', m)
        print(f"True average length {set(of_modification['TrueAverageLength'])}")
        _fdr_boxplot_grid(of_modification, m, 'AverageLength', f'length_{m}.pdf')

In [None]:
# Detailed library x multiplier grids of peak count and average length per modification.
print('Summary number of peaks and length')
peaks_lengths(df)

## Precision / Recall

In [None]:
print('Detailed Precision / Recall plot')

# One 3x3 figure (library rows x multiplier columns) per modification; every panel
# holds one precision/recall line per replicate 'I' and tool.
for m in sorted(set(df['Modification'])):
    print(m)
    fig = plt.figure(figsize=(5 * 3, 5 * 3))
    i = 1
    for lib in ['1mln', '500k', '200k']:
        for mult in [1.0, 0.5, 0.2]:
            t = df.loc[(df['Modification'] == m) & (df['Mult'] == mult) & (df['Library'] == lib)]
            ax = plt.subplot(3, 3, i)
            for I in set(t['I']):
                tt = t.loc[t['I'] == I].sort_values(by=['Tool', 'Recall', 'Precision'])
                sns.lineplot(data=tt, x='Recall', y='Precision', hue='Tool',
                             markers=True, style='I', palette=palette, alpha=0.5, ax=ax)
            if i == 1:
                # Single legend: keep the first group of entries only (the 'Tool' header
                # plus one entry per tool). The tool count comes from the panel data `t`;
                # `tt` is undefined or stale when a panel has no replicates.
                handles, labels = ax.get_legend_handles_labels()
                n_entries = len(set(t['Tool'])) + 1
                if handles:
                    ax.legend(handles=handles[:n_entries], labels=labels[:n_entries],
                              loc='lower left')
            else:
                # A panel without data has no legend to remove.
                legend = ax.get_legend()
                if legend is not None:
                    legend.remove()

            i += 1
            plt.xlim(-0.1, 1.1)
            plt.ylim(-0.1, 1.1)
            plt.title(f'{m} {lib} {mult}')
    plt.savefig(os.path.join(FIGURES_DIR, f'prc_plot_{m}.pdf'))
    plt.show()

In [None]:
def precision_recall(df, name):
    """Plot precision against recall for every tool, one figure per modification.

    For each modification one line per (library, multiplier, replicate 'I', tool) is
    drawn, coloured by tool. The figure is saved to FIGURES_DIR as
    'precision_recall_{name}_{m}_{name}.pdf' and shown.

    Each line is drawn once with alpha=0.2. The previous version repeated every
    draw once per tool (an unused inner loop), which only made the lines darker.

    :param df: report frame with 'Modification', 'Library', 'Mult', 'I', 'Tool',
        'Recall' and 'Precision' columns.
    :param name: label used in the title and in the file name.
    """
    for m in sorted(set(df['Modification'])):
        print('Peaks', name, m)
        t = df.loc[df['Modification'] == m]
        # Own the axes explicitly: the legend code below used a name `ax` that was never
        # defined in this function and only resolved to a leftover global from another cell.
        fig, ax = plt.subplots(figsize=(6, 6))
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for i in set(t['I']):
                    # Build the mask from `t` only; mixing in `df['Mult']` relied on
                    # index alignment between two frames of different length.
                    tt = t.loc[(t['Mult'] == mult) & (t['Library'] == lib) & (t['I'] == i)]
                    if tt.empty:
                        continue
                    tt = tt.sort_values(by=['Tool', 'Recall', 'Precision'])
                    sns.lineplot(data=tt, x='Recall', y='Precision', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.2, ax=ax)

        # Single legend: keep the first group of entries only (the 'Tool' header plus
        # one entry per tool).
        handles, labels = ax.get_legend_handles_labels()
        n_entries = len(set(t['Tool'])) + 1
        if handles:
            ax.legend(handles=handles[:n_entries], labels=labels[:n_entries], loc='lower left')
        # Zoom into the upper right corner unless the data extends below 0.6.
        ax.set_xlim(min(0.6, t['Recall'].min()), 1.1)
        ax.set_ylim(min(0.6, t['Precision'].min()), 1.1)
        ax.set_title(f'{m} {name}')
        # NOTE(review): `name` appears twice in the file name; kept as is so existing
        # figure paths do not change.
        plt.savefig(os.path.join(FIGURES_DIR, f'precision_recall_{name}_{m}_{name}.pdf'))
        plt.show()

In [None]:
print('Precision / Recall')
precision_recall(df.loc[(df['Library'] == '1mln') & (df['Mult'] == 1.0)], 'good_quality')

In [None]:
print('Precision / Recall low coverage')
precision_recall(df.loc[df['Library'] == '200k'], 'low_coverage')

In [None]:
print('Precision / Recall low frip')
precision_recall(df.loc[df['Mult'] == 0.2], 'low_frip')

## Aggregated precision / recall by modification

In [None]:
def aggregated_precision_recall(df, name):
    """Plot precision against recall for all modifications side by side in one figure.

    One panel per modification; in each panel one line per (library, multiplier,
    replicate 'I', tool) is drawn, coloured by tool. Only the first panel keeps its
    legend. The figure is saved to FIGURES_DIR as
    'aggregated_precision_recall_{name}.pdf' and shown.

    Each line is drawn once with alpha=0.1. The previous version repeated every
    draw once per tool (an unused inner loop), which only made the lines darker.

    :param df: report frame with 'Modification', 'Library', 'Mult', 'I', 'Tool',
        'Recall' and 'Precision' columns.
    :param name: label used in the file name.
    """
    print('Peaks', name)
    modifications = sorted(set(df['Modification']))
    # Keep the original 5-column layout; widen it only if there are more modifications,
    # which would otherwise make plt.subplot fail.
    n_cols = max(5, len(modifications))
    fig = plt.figure(figsize=(18, 3))
    for i, m in enumerate(modifications):
        ax = plt.subplot(1, n_cols, i + 1)
        tt = df.loc[df['Modification'] == m]
        for lib in ['1mln', '500k', '200k']:
            for mult in [1.0, 0.5, 0.2]:
                for I in set(tt['I']):
                    ttt = tt.loc[(tt['Mult'] == mult) & (tt['Library'] == lib) & (tt['I'] == I)]
                    if ttt.empty:
                        continue
                    ttt = ttt.sort_values(by=['Tool', 'Recall', 'Precision'])
                    sns.lineplot(data=ttt, x='Recall', y='Precision', hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.1, ax=ax)

        if i == 0:
            # Single legend: keep the first group of entries only (the 'Tool' header
            # plus one entry per tool).
            handles, labels = ax.get_legend_handles_labels()
            n_entries = len(set(tt['Tool'])) + 1
            if handles:
                ax.legend(handles=handles[:n_entries], labels=labels[:n_entries],
                          loc='lower left')
        else:
            # A panel without data has no legend to remove.
            legend = ax.get_legend()
            if legend is not None:
                legend.remove()
        ax.set_xlim(-0.1, 1.1)
        ax.set_ylim(-0.1, 1.1)
        ax.set_title(m)
    plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_precision_recall_{name}.pdf'))
    plt.show()

In [None]:
print('Precision / Recall')
aggregated_precision_recall(df.loc[(df['Library'] == '1mln') & (df['Mult'] == 1.0)], 'good_quality')

In [None]:
print('Precision / Recall low coverage')
aggregated_precision_recall(df.loc[df['Library'] == '200k'], 'low_coverage')

In [None]:
print('Precision / Recall low frip')
aggregated_precision_recall(df.loc[df['Mult'] == 0.2], 'low_frip')

In [None]:
# aggregated_precision_recall(df, 'All')

# Average precision

AP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight:

$$AP = \sum_n (R_n - R_{n-1}) \, P_n,$$

where $P_n$ and $R_n$ are the precision and recall at the $n$-th threshold.

This implementation is not interpolated and is different from computing the area under the precision-recall curve with the trapezoidal rule, which uses linear interpolation and can be too optimistic.

In [None]:
# Average precision of every (modification, library, multiplier, replicate, tool) run:
# AP = sum over thresholds of (R_n - R_{n-1}) * P_n, with rows ordered by recall.
ap_records = []
for m in sorted(set(df['Modification'])):
    print(m)
    for lib in ['1mln', '500k', '200k']:
        for mult in [1.0, 0.5, 0.2]:
            t = df.loc[(df['Modification'] == m) & (df['Mult'] == mult) & (df['Library'] == lib)]
            for tool in sorted(set(t['Tool'])):
                for i in sorted(set(t['I'])):
                    tt = t.loc[(t['Tool'] == tool) & (t['I'] == i)]
                    tt = tt.sort_values(by=['Recall', 'Precision'])
                    recall = tt['Recall'].to_numpy(dtype=float)
                    precision = tt['Precision'].to_numpy(dtype=float)
                    # Recall increments start from 0; a combination without rows
                    # yields AP = 0, as before.
                    ap = float(np.sum(np.diff(recall, prepend=0.0) * precision))
                    ap_records.append((m, lib, mult, i, tool, ap))

# Build the frame once instead of appending row by row.
dfap = pd.DataFrame(ap_records, columns=['Modification', 'Library', 'Mult', 'I', 'Tool', 'AP'])


In [None]:
# One 3x3 figure (library rows x multiplier columns) per modification with the
# distribution of average precision per tool.
for m in sorted(set(dfap['Modification'])):
    print(m)
    fig = plt.figure(figsize=(5 * 3, 5 * 3))
    i = 1
    for lib in ['1mln', '500k', '200k']:
        for mult in [1.0, 0.5, 0.2]:
            tt = dfap.loc[(dfap['Modification'] == m) &
                          (dfap['Mult'] == mult) & (dfap['Library'] == lib)]
            tt = tt.sort_values(by=['Tool'])
            ax = plt.subplot(3, 3, i)
            if not tt.empty:
                # The tools are already named on the x axis, so no legend is requested:
                # there are no labelled artists and ax.legend() would only warn.
                sns.boxplot(x='Tool', y='AP', data=tt, palette=palette, ax=ax)
            plt.title(f'{m} {lib} {mult}')
            i += 1
            plt.ylim(-0.1, 1.1)
    plt.savefig(os.path.join(FIGURES_DIR, f'ap_{m}.pdf'))
    plt.show()

# Aggregated AP score

In [None]:
def plot_aggregated_ap(dfap):
    """Plot the average precision per modification and tool for several quality subsets.

    The rows of `dfap` are grouped into quality subsets (a row may belong to several):
      * 'Good'         -- Library == '1mln' and Mult == 1.0
      * 'Low FRIP'     -- Mult == 0.2
      * 'Low Coverage' -- Library == '200k'
      * 'All'          -- every row
    For every subset one box plot is saved to FIGURES_DIR as 'aggregated_ap_{quality}.pdf'.

    :param dfap: frame with 'Modification', 'Library', 'Mult', 'Tool' and 'AP' columns.
    """
    selections = (
        ('Good', (dfap['Library'] == '1mln') & (dfap['Mult'] == 1.0)),
        ('Low FRIP', dfap['Mult'] == 0.2),
        ('Low Coverage', dfap['Library'] == '200k'),
        ('All', slice(None)),
    )
    ts = []
    for quality, rows in selections:
        t = dfap.loc[rows].copy()
        t['Quality'] = quality
        ts.append(t)
    dfap = pd.concat(ts)

    # Use the notebook-wide tool colours so tools look the same as in the other
    # figures; fall back to seaborn's default if some tool has no assigned colour.
    tool_palette = palette if set(dfap['Tool']) <= set(palette) else None

    for q in sorted(set(dfap['Quality'])):
        print(q)
        plt.figure(figsize=(12, 4))
        tt = dfap.loc[dfap['Quality'] == q].sort_values(by=['Tool'])
        sns.boxplot(x='Modification', y='AP', hue='Tool', data=tt, palette=tool_palette)
        plt.title(f'Average precision {q}')
        plt.ylim(-0.1, 1.1)
        plt.savefig(os.path.join(FIGURES_DIR, f'aggregated_ap_{q}.pdf'))
        plt.show()


In [None]:
# Box plots of the average precision for every quality subset.
plot_aggregated_ap(dfap)