# Analysis of ChIP-seq simulation

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from collections import OrderedDict, Counter

In [None]:
# Load the benchmark report; lines starting with '#' in the TSV are skipped.
df = pd.read_csv('/mnt/stripe/shpynov/2021_chips/report.tsv', sep='\t', comment='#')

# Focus on H3K4me3 now
df = df.loc[df['PeaksSource'].isin(['macs2'])]
df = df.loc[df['Modification'] == 'H3K4me3']
df = df.loc[df['Tool'].isin(['Macs2', 'Macs2Broad', 'SICER', 'SPAN-GAP5'])]

# Fix SPAN default naming to avoid duplicate record: drop rows whose peaks
# file name contains '_1E-6_' (literal match; missing names are kept).
df = df.loc[~df['PeaksFile'].str.contains('_1E-6_', regex=False, na=False)]

# Work on an explicit copy so that column assignments below (and in later
# cells) modify an independent frame instead of a filtered view.
df = df.copy()
df['Fdr'] = df['Fdr'].astype(float)

# Remove relaxed FDR settings to avoid an explosion in #peaks
df = df.loc[df['Fdr'] <= 0.01].copy()
df

In [None]:
# Order rows lexicographically by the listed columns (leftmost key first).
df = df.sort_values(by=['Modification', 'PeaksSource', 'Mult', 'Library', 'Tool', 'Fdr'])

In [None]:
# Average length = total length / number of peaks, for both the ground truth
# and the called peaks. Afterwards every NaN in the whole frame (not only in
# the two new columns) is replaced by 0.
df = df.assign(
    TrueAverageLength=df['TrueLength'] / df['TruePeaks'],
    AverageLength=df['Length'] / df['Peaks'],
).fillna(0)

In [None]:
# log10 of the FDR threshold; rows with Fdr exactly 0.05 get the fixed value -1.3.
df['LogFdr'] = np.where(df['Fdr'] == 0.05, -1.3, np.log10(df['Fdr']))

In [None]:
# Normalise the 'Precision' and 'Recall' columns into fractions, order rows by
# (RecallF ascending, PrecisionF descending), then replace every remaining NaN
# in the frame with 1. The sort runs before the fill, so rows with NaN sort keys
# are placed last.
# NOTE(review): 'Precision' is divided by 'TruePeaks' and 'Recall' by 'Peaks' --
# confirm against the report generator that these denominators are the intended ones.
df = (
    df.assign(
        PrecisionF=df['Precision'] / df['TruePeaks'],
        RecallF=df['Recall'] / df['Peaks'],
    )
    .sort_values(by=['RecallF', 'PrecisionF'], ascending=[True, False])
    .fillna(1)
)

In [None]:
# Cap extreme values so that a few outliers do not dominate the plot axes.
print('Limit peaks number')
df['Peaks'] = df['Peaks'].clip(upper=3000)
print('Limit average length')
df['AverageLength'] = df['AverageLength'].clip(upper=10000)

# Summary number of peaks

In [None]:
# One colour per tool, sampled from the 'jet' colormap resampled to as many
# entries as there are tools. Tools are sorted so the tool -> colour mapping is
# stable between runs. `plt.get_cmap` is used because `plt.cm.get_cmap` was
# deprecated and later removed from Matplotlib.
tools = sorted(set(df['Tool']))
cmap = plt.get_cmap('jet', len(tools))
palette = {t: cmap(i) for i, t in enumerate(tools)}

In [None]:
def summary_peaks_lengths(df, name):
    """Box-plot peak counts and average peak lengths per FDR threshold and tool.

    For every (PeaksSource, Modification) pair present in ``df`` one two-panel
    figure is drawn: ``Peaks`` on the left and ``AverageLength`` on the right,
    both grouped by ``Fdr`` and coloured by ``Tool`` with the notebook-level
    ``palette``. The distinct ``TruePeaks`` / ``TrueAverageLength`` values of
    the subset are printed, and the figure is saved as a PNG and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot; must contain the columns named above.
    name : str
        Label used in the panel titles and in the output file name.
    """
    def draw_panel(ax, data, column, title):
        # Single box-plot panel with slanted x tick labels and its own legend.
        sns.boxplot(data=data, x="Fdr", y=column, hue="Tool", palette=palette, ax=ax)
        ax.set_title(title)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(loc='upper left')

    for ps in sorted(set(df['PeaksSource'])):
        for m in sorted(set(df['Modification'])):
            selected = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m)]
            selected = selected.sort_values(by=['Tool', 'Fdr'])
            title = f'{m} {ps} {name}'

            fig, (peaks_ax, length_ax) = plt.subplots(1, 2, figsize=(15, 5))

            print('Peaks', ps, m)
            print(f"True peaks {set(selected['TruePeaks'])}")
            draw_panel(peaks_ax, selected, "Peaks", title)

            print('Length', ps, m)
            print(f"True average length {set(selected['TrueAverageLength'])}")
            draw_panel(length_ax, selected, "AverageLength", title)

            fig.savefig(f'/mnt/stripe/shpynov/2021_chips/summary_peaks_length_{name}_{m}_{ps}.png')
            plt.show()

In [None]:
# t = t.loc[(t['Modification'] == 'H3K4me1') & (t['PeaksSource'] == 'macs2') & (t['Peaks'] >= 1800) & (t['Tool'] == 'SPAN-GAP5')]
# display(t)
# print('\n'.join(sorted(list(t['PeaksFile']))))

In [None]:
# Restrict to the 1mln library with Mult '_1.0' before plotting.
print('Summary number of peaks and length good quality')
summary_peaks_lengths(df.query("Library == '1mln' and Mult == '_1.0'"), 'good_quality')

In [None]:
# Restrict to the 200k library before plotting.
print('Summary number of peaks and length on low coverage')
summary_peaks_lengths(df.query("Library == '200k'"), 'low_coverage')

In [None]:
# Restrict to Mult '_0.2' before plotting.
print('Summary number of peaks and length on low frip')
summary_peaks_lengths(df.query("Mult == '_0.2'"), 'low_frip')

In [None]:
# Peak files of the first five SPAN-GAP5 rows with Mult '_0.2' and more than 1500 peaks.
df.query("Mult == '_0.2' and Peaks > 1500 and Tool == 'SPAN-GAP5'")['PeaksFile'].head().tolist()

In [None]:
# No filtering here: the whole frame is plotted, labelled 'all' in titles and output file names.
print('Summary number of peaks and length')
summary_peaks_lengths(df, 'all')

## Peaks vs Average length

In [None]:
def summary_vs(df, name, v1, v2):
    """Plot column ``v2`` against column ``v1`` as one line per tool and replicate.

    For every (PeaksSource, Modification) pair in ``df`` a single figure is
    produced. Inside it, each (Library, Mult, I) combination that has rows is
    drawn once as a semi-transparent seaborn line plot coloured by ``Tool``
    using the notebook-level ``palette``. The figure is saved as a PNG and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PeaksSource, Modification, Library, Mult, I, Tool, Fdr
        and the two columns named by ``v1`` and ``v2``.
    name : str
        Label used in the title and in the output file name.
    v1, v2 : str
        Column names plotted on the x and y axis respectively.
    """
    for ps in sorted(set(df['PeaksSource'])):
        for m in sorted(set(df['Modification'])):
            print(ps, m)
            t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m)]
            n_tools = len(set(t['Tool']))
            # Explicit figure/axes: every lineplot and the legend target this Axes.
            fig, ax = plt.subplots(figsize=(12, 6))
            for lib in ['1mln', '500k', '200k']:
                for mult in ['_1.0', '_0.5', '_0.2']:
                    # Masks are built from `t` itself so their index matches `t`.
                    cond = t.loc[(t['Mult'] == mult) & (t['Library'] == lib)]
                    # Only replicates present for this condition are visited, and
                    # each one is drawn exactly once (hue='Tool' already splits tools).
                    for i in sorted(set(cond['I'])):
                        tt = cond.loc[cond['I'] == i].sort_values(by=['Tool', 'Fdr'])
                        sns.lineplot(data=tt, x=v1, y=v2, hue='Tool',
                                     markers=True, style='I', palette=palette,
                                     alpha=0.2, ax=ax)
            # Single legend: keep only the first n_tools + 1 entries.
            # NOTE(review): the "+ 1" presumably covers a legend subtitle entry that
            # seaborn emits before the hue levels -- confirm with the installed seaborn.
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[:n_tools + 1],
                      labels=labels[:n_tools + 1],
                      loc='lower left')
            ax.set_title(f'{m} {ps} {name}')
            fig.savefig(f'/mnt/stripe/shpynov/2021_chips/summary_{v1}_vs_{v2}_{m}_{ps}_{name}.png')
            plt.show()

In [None]:
# Peaks (x) against AverageLength (y) for the 1mln library with Mult '_1.0'.
print('Peaks vs Average Length')
summary_vs(df.query("Library == '1mln' and Mult == '_1.0'"),
           'good_quality', 'Peaks', 'AverageLength')

In [None]:
def _boxplot_grid(df, ps, m, column, path):
    """Draw a 3x3 grid (library x mult) of per-FDR box plots of ``column`` and save it to ``path``."""
    fig = plt.figure(figsize=(5 * 3, 5 * 3))
    i = 1
    for lib in ['1mln', '500k', '200k']:
        for mult in ['_1.0', '_0.5', '_0.2']:
            t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m) &
                       (df['Mult'] == mult) & (df['Library'] == lib)]
            ax = plt.subplot(3, 3, i)
            if not t.empty:
                sns.boxplot(data=t, x="Fdr", y=column, hue="Tool", palette=palette, ax=ax)
                plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
                # Upper limit follows the plotted column, with 20% headroom.
                ax.set_ylim(-20, t[column].max() * 1.2)
            ax.set_title(f'{m} {ps} {lib} {mult}')
            # Keep the legend on the first panel only; a panel without data has none.
            if i == 1 and not t.empty:
                ax.legend(loc='upper left')
            elif ax.get_legend() is not None:
                ax.get_legend().remove()
            i += 1
    plt.savefig(path)
    plt.show()


def peaks_lengths(df):
    """Plot per-condition box plots of peak counts and of average peak lengths.

    For every (PeaksSource, Modification) pair in ``df`` two figures are made,
    each a 3x3 grid with one panel per (Library, Mult) combination: the first
    shows ``Peaks`` and the second ``AverageLength``, both grouped by ``Fdr``
    and coloured by ``Tool``. The distinct ``TruePeaks`` and
    ``TrueAverageLength`` values are printed before the respective figure, and
    each figure is saved as a PNG and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PeaksSource, Modification, Library, Mult, Tool, Fdr,
        Peaks, AverageLength, TruePeaks and TrueAverageLength.
    """
    for ps in sorted(set(df['PeaksSource'])):
        for m in sorted(set(df['Modification'])):
            subset = df[(df['PeaksSource'] == ps) & (df['Modification'] == m)]

            print('Peaks', ps, m)
            print(f"True peaks {set(subset['TruePeaks'])}")
            _boxplot_grid(df, ps, m, 'Peaks',
                          f'/mnt/stripe/shpynov/2021_chips/peaks_{m}_{ps}.png')

            print('Length', ps, m)
            print(f"True average length {set(subset['TrueAverageLength'])}")
            # The y-limit is derived from AverageLength (the plotted column),
            # not from the total 'Length' column.
            _boxplot_grid(df, ps, m, 'AverageLength',
                          f'/mnt/stripe/shpynov/2021_chips/length_{m}_{ps}.png')

In [None]:
# Draw the per-(Library, Mult) grids of peak counts and average lengths for the whole frame.
print('Summary number of peaks and length')
peaks_lengths(df)

## Precision / Recall

In [None]:
print('Detailed Precision / Recall plot')

# One 3x3 figure per (PeaksSource, Modification): a panel per (Library, Mult)
# condition, one PrecisionF/RecallF line per tool and replicate ('I').
for ps in sorted(set(df['PeaksSource'])):
    for m in sorted(set(df['Modification'])):
        print(ps, m)
        fig = plt.figure(figsize=(5 * 3, 5 * 3))
        i = 1
        for lib in ['1mln', '500k', '200k']:
            for mult in ['_1.0', '_0.5', '_0.2']:
                t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m) &
                           (df['Mult'] == mult) & (df['Library'] == lib)]
                ax = plt.subplot(3, 3, i)
                for I in sorted(set(t['I'])):
                    tt = t.loc[t['I'] == I].sort_values(by=['Tool', 'PrecisionF', 'RecallF'])
                    sns.lineplot(data=tt, x="PrecisionF", y="RecallF", hue='Tool',
                                 markers=True, style='I', palette=palette, alpha=0.5, ax=ax)
                if i == 1:
                    # Single legend on the first panel. The tool count comes from the
                    # panel's own rows, so it is defined even when no replicate was drawn.
                    n_tools = len(set(t['Tool']))
                    handles, labels = ax.get_legend_handles_labels()
                    ax.legend(handles=handles[:n_tools + 1],
                              labels=labels[:n_tools + 1],
                              loc='lower left')
                elif ax.get_legend() is not None:
                    # Panels without data never get a legend, so removal is guarded.
                    ax.get_legend().remove()

                i += 1
                ax.set_xlim(-0.1, 1.1)
                ax.set_ylim(-0.1, 1.1)
                ax.set_title(f'{m} {ps} {lib} {mult}')
        plt.savefig(f'/mnt/stripe/shpynov/2021_chips/prc_plot_{m}_{ps}.png')
        plt.show()

In [None]:
# Investigate bad precision/recall samples: both fractions strictly between 0 and 0.2
df.query('0 < PrecisionF < 0.2 and 0 < RecallF < 0.2')

In [None]:
def precision_recall(df, name):
    """Plot RecallF against PrecisionF, one line per tool and replicate.

    For every (PeaksSource, Modification) pair in ``df`` one figure is drawn.
    Each (Library, Mult, I) combination that has rows contributes one
    semi-transparent seaborn line plot coloured by ``Tool`` using the
    notebook-level ``palette``. The figure is saved as a PNG and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PeaksSource, Modification, Library, Mult, I, Tool, Fdr,
        PrecisionF and RecallF.
    name : str
        Label used in the title and (twice) in the output file name; the file
        name pattern is kept as it was so existing outputs are overwritten
        rather than duplicated.
    """
    for ps in sorted(set(df['PeaksSource'])):
        for m in sorted(set(df['Modification'])):
            print('Peaks', name, ps, m)
            t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m)]
            n_tools = len(set(t['Tool']))
            # The Axes is created here and passed explicitly, so legend handles
            # are read from this figure and not from a leftover global `ax`.
            fig, ax = plt.subplots(figsize=(6, 6))
            for lib in ['1mln', '500k', '200k']:
                for mult in ['_1.0', '_0.5', '_0.2']:
                    # Masks are built from `t` itself so their index matches `t`.
                    cond = t.loc[(t['Mult'] == mult) & (t['Library'] == lib)]
                    # Each replicate is drawn once; hue='Tool' separates the tools.
                    for i in sorted(set(cond['I'])):
                        tt = cond.loc[cond['I'] == i].sort_values(by=['Tool', 'Fdr'])
                        sns.lineplot(data=tt, x='PrecisionF', y='RecallF', hue='Tool',
                                     markers=True, style='I', palette=palette,
                                     alpha=0.2, ax=ax)

            # Single legend: keep only the first n_tools + 1 entries.
            handles, labels = ax.get_legend_handles_labels()
            ax.legend(handles=handles[:n_tools + 1],
                      labels=labels[:n_tools + 1],
                      loc='lower left')
            # Zoom into the upper-right corner unless the data reach below 0.6.
            ax.set_xlim(min(0.6, t['PrecisionF'].min()), 1.1)
            ax.set_ylim(min(0.6, t['RecallF'].min()), 1.1)
            ax.set_title(f'{m} {ps} {name}')
            fig.savefig(f'/mnt/stripe/shpynov/2021_chips/precision_recall_{name}_{m}_{ps}_{name}.png')
            plt.show()

In [None]:
# Restrict to the 1mln library with Mult '_1.0' before plotting.
print('Precision / Recall')
precision_recall(df.query("Library == '1mln' and Mult == '_1.0'"), 'good_quality')

In [None]:
# Restrict to the 200k library before plotting.
print('Precision / Recall low coverage')
precision_recall(df.query("Library == '200k'"), 'low_coverage')

In [None]:
# Restrict to Mult '_0.2' before plotting.
print('Precision / Recall low frip')
precision_recall(df.query("Mult == '_0.2'"), 'low_frip')

In [None]:
def peaks_precision_recall(df, name):
    """Plot PrecisionF and RecallF against the number of peaks, side by side.

    For every (PeaksSource, Modification) pair in ``df`` one two-panel figure
    is drawn: ``Peaks`` vs ``PrecisionF`` on the left (with the legend) and
    ``Peaks`` vs ``RecallF`` on the right (titled, without legend). Each
    (Library, Mult, I) combination that has rows contributes one
    semi-transparent line plot per panel, coloured by ``Tool`` using the
    notebook-level ``palette``. The figure is saved as a PNG and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain PeaksSource, Modification, Library, Mult, I, Tool, Fdr,
        Peaks, PrecisionF and RecallF.
    name : str
        Label used in the title and (twice) in the output file name.
    """
    def draw_metric(ax, t, metric):
        # One line plot per existing (Library, Mult, I) combination, drawn once.
        for lib in ['1mln', '500k', '200k']:
            for mult in ['_1.0', '_0.5', '_0.2']:
                # Masks are built from `t` itself so their index matches `t`.
                cond = t.loc[(t['Mult'] == mult) & (t['Library'] == lib)]
                for i in sorted(set(cond['I'])):
                    tt = cond.loc[cond['I'] == i].sort_values(by=['Tool', 'Fdr'])
                    sns.lineplot(data=tt, x='Peaks', y=metric, hue='Tool',
                                 markers=True, style='I', palette=palette,
                                 alpha=0.2, ax=ax)

    for ps in sorted(set(df['PeaksSource'])):
        for m in sorted(set(df['Modification'])):
            print('Peaks', name, ps, m)
            t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m)]
            n_tools = len(set(t['Tool']))
            fig, (precision_ax, recall_ax) = plt.subplots(1, 2, figsize=(15, 6))

            draw_metric(precision_ax, t, 'PrecisionF')
            # Single legend: keep only the first n_tools + 1 entries.
            handles, labels = precision_ax.get_legend_handles_labels()
            precision_ax.legend(handles=handles[:n_tools + 1],
                                labels=labels[:n_tools + 1],
                                loc='lower left')

            draw_metric(recall_ax, t, 'RecallF')
            # If nothing was drawn there is no legend to remove.
            if recall_ax.get_legend() is not None:
                recall_ax.get_legend().remove()

            recall_ax.set_title(f'{m} {ps} {name}')
            fig.savefig(f'/mnt/stripe/shpynov/2021_chips/summary_peaks_precision_recall_{name}_{m}_{ps}_{name}.png')
            plt.show()

In [None]:
# Restrict to the 1mln library with Mult '_1.0' before plotting.
print('Peaks vs Precision / Recall')
peaks_precision_recall(df.query("Library == '1mln' and Mult == '_1.0'"), 'good_quality')

In [None]:
# Restrict to the 200k library before plotting.
print('Peaks vs Precision / Recall Low coverage')
peaks_precision_recall(df.query("Library == '200k'"), 'low_coverage')

In [None]:
# Restrict to Mult '_0.2' before plotting.
print('Peaks vs Precision / Recall Low frip')
peaks_precision_recall(df.query("Mult == '_0.2'"), 'low_frip')

# Average precision

AP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight:

$\mathrm{AP} = \sum_n (R_n - R_{n-1})\,P_n$, where $P_n$ and $R_n$ are the precision and recall at the $n$-th threshold.

This implementation is not interpolated and is different from computing the area under the precision-recall curve with the trapezoidal rule, which uses linear interpolation and can be too optimistic.

In [None]:
def _average_precision(recall, precision):
    """Non-interpolated average precision: sum over n of (R_n - R_{n-1}) * P_n with R_0 = 0.

    ``recall`` and ``precision`` are equal-length sequences already ordered by
    increasing recall. Returns 0.0 for empty input.
    """
    recall = np.asarray(recall, dtype=float)
    precision = np.asarray(precision, dtype=float)
    # prepend=0.0 supplies R_0, so the first term is R_1 * P_1.
    return float(np.sum(np.diff(recall, prepend=0.0) * precision))


# One AP value per (modification, peaks source, library, mult, replicate, tool).
# Records are collected in a list and turned into a frame once, instead of
# growing the frame row by row.
ap_records = []
for ps in sorted(set(df['PeaksSource'])):
    for m in sorted(set(df['Modification'])):
        print(ps, m)
        for lib in ['1mln', '500k', '200k']:
            for mult in ['_1.0', '_0.5', '_0.2']:
                t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m) &
                           (df['Mult'] == mult) & (df['Library'] == lib)]
                for tool in sorted(set(t['Tool'])):
                    for i in sorted(set(t['I'])):
                        # Thresholds ordered by increasing recall (ties by precision).
                        tt = t.loc[(t['Tool'] == tool) & (t['I'] == i)].sort_values(
                            by=['RecallF', 'PrecisionF'])
                        # A (tool, replicate) pair without rows yields AP 0, as before.
                        ap = _average_precision(tt['RecallF'], tt['PrecisionF'])
                        ap_records.append((m, ps, lib, mult, i, tool, ap))

dfap = pd.DataFrame(
    ap_records,
    columns=['Modification', 'PeaksSource', 'Library', 'Mult', 'I', 'Tool', 'AP'],
)


In [None]:
def plot_summary_ap(dfap, name):
    """Box-plot average precision per peaks source and tool, one figure per modification.

    Parameters
    ----------
    dfap : pandas.DataFrame
        Must contain Modification, PeaksSource, Tool and AP.
    name : str
        Label used in the title and in the output file name.
    """
    for m in sorted(set(dfap['Modification'])):
        print(m)
        fig = plt.figure(figsize=(10, 5))
        tt = dfap.loc[dfap['Modification'] == m].sort_values(by=['Tool', 'PeaksSource'])
        # Keep the Axes returned by seaborn so the legend is placed on this
        # figure rather than on whatever global `ax` an earlier cell left behind.
        ax = sns.boxplot(x='PeaksSource', y='AP', hue='Tool', data=tt, palette=palette)
        ax.set_title(f'{m} {name}')
        ax.legend(loc='lower left')
        ax.set_ylim(-0.1, 1.1)
        fig.savefig(f'/mnt/stripe/shpynov/2021_chips/summary_ap_{name}_{m}.png')
        plt.show()

In [None]:
# Restrict to the 1mln library with Mult '_1.0' before plotting.
print('Good quality')
plot_summary_ap(dfap.query("Library == '1mln' and Mult == '_1.0'"), 'good_quality')

In [None]:
# Restrict to the 200k library before plotting.
print('Low coverage AP')
plot_summary_ap(dfap.query("Library == '200k'"), 'low_coverage')

In [None]:
# Restrict to Mult '_0.2' before plotting.
print('Low frip AP')
plot_summary_ap(dfap.query("Mult == '_0.2'"), 'low_frip')

In [None]:
# No filtering here: AP box plots over every row of dfap, labelled 'all'.
print('Summary AP')
plot_summary_ap(dfap, 'all')

In [None]:
# One 3x3 figure per modification: a panel for each (Library, Mult) pair, laid
# out row by row, showing AP per peaks source and tool.
for m in sorted(set(dfap['Modification'])):
    print(m)
    fig, axes = plt.subplots(3, 3, figsize=(5 * 3, 5 * 3))
    conditions = [(lib, mult)
                  for lib in ['1mln', '500k', '200k']
                  for mult in ['_1.0', '_0.5', '_0.2']]
    for i, (ax, (lib, mult)) in enumerate(zip(axes.flat, conditions), start=1):
        selected = ((dfap['Modification'] == m) &
                    (dfap['Mult'] == mult) & (dfap['Library'] == lib))
        tt = dfap.loc[selected].sort_values(by=['Tool'])
        sns.boxplot(x='PeaksSource', y='AP', hue='Tool', data=tt, palette=palette, ax=ax)
        ax.set_title(f'{m} {lib} {mult}')
        # Legend is kept on the first panel only.
        if i == 1:
            ax.legend(loc='lower left')
        else:
            ax.get_legend().remove()
        ax.set_ylim(-0.1, 1.1)
    fig.savefig(f'/mnt/stripe/shpynov/2021_chips/ap_{m}.png')
    plt.show()