# Analysis of ChIP-seq simulation

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
from collections import OrderedDict, Counter

In [None]:
# Load the simulation report (text following '#' is treated as a comment), keep only
# rows whose PeaksSource is macs2 or sicer, drop the SPAN-GAP0 tool and cast Fdr to float.
df = (
    pd.read_csv('/mnt/stripe/shpynov/2021_chips/report.tsv', sep='\t', comment='#')
    .loc[lambda d: d['PeaksSource'].isin({'macs2', 'sicer'}) & d['Tool'].ne('SPAN-GAP0')]
    .assign(Fdr=lambda d: d['Fdr'].astype(float))
)
# Quick look at a random sample of the filtered rows.
df.sample(10)

In [None]:
# Order rows by Modification, PeaksSource, Mult, Library, Tool and finally Fdr.
df = df.sort_values(by=['Modification', 'PeaksSource', 'Mult', 'Library', 'Tool', 'Fdr'])

In [None]:
# Average length = total length / number of peaks, computed for the True* columns and
# for the called peaks. Any NaN left in the frame (e.g. from 0 / 0) is replaced with 0.
df = df.assign(
    TrueAverageLength=df['TrueLength'] / df['TruePeaks'],
    AverageLength=df['Length'] / df['Peaks'],
).fillna(0)
df.sample(10)

In [None]:
# log10 of the Fdr column; rows with Fdr == 0.05 are pinned to exactly -1.3.
df['LogFdr'] = np.where(df['Fdr'] == 0.05, -1.3, np.log10(df['Fdr']))

# Summary number of peaks

In [None]:
# One distinct colour per tool, assigned in sorted tool order so that every figure
# uses the same tool -> colour mapping.
# pyplot.get_cmap is used instead of matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9; both accept the (name, lut) arguments used here.
tools = sorted(set(df['Tool']))
cmap = plt.get_cmap('jet', len(tools))
palette = {tool: cmap(idx) for idx, tool in enumerate(tools)}

In [None]:
# print('Single launch only')

# sources = len(set(df['PeaksSource']))
# for lib in ['1mln', '500k', '100k']:
#     for mult in ['_1.0', '_0.5', '_0.1']:
#         print(mult, lib)
#         t = df.loc[np.logical_and(df['Mult'] == mult, df['Library'] == lib)]
#         for m in sorted(set(df['Modification'])):
#             for I in [1]: #sorted(set(df['I'])):
#                 fig = plt.figure(figsize=(8 * sources, 8))
#                 tt = t.loc[np.logical_and(t['Modification']==m, t['I']==I)]
#                 print(I)
#                 for i, ps in enumerate(sorted(set(df['PeaksSource']))):
#                     ax = plt.subplot(1, sources, i + 1)
#                     ttt = tt.loc[tt['PeaksSource'] == ps]
#                     sns.lineplot(data=ttt, x="LogFdr", y="Peaks", hue="Tool", palette=palette)
#                     plt.title(f'{m} {ps} {t["TruePeaks"].values[0]}')
#                     if len(ttt):
#                         plt.ylim(0, ttt['Peaks'].max())
#                         if i == 0:
#                             ax.legend(loc='lower left')
#                         else:
#                             ax.get_legend().remove()    
#                 plt.show()

In [None]:
# Cap extreme values for display only. The caps are applied to a plotting copy so that
# `df` keeps the real values: RecallF is later computed as Recall / df['Peaks'], and
# capping the shared frame in place would silently distort it.
print('Limit peaks number')
df_plot = df.copy()
df_plot.loc[df_plot['Peaks'] > 2000, 'Peaks'] = 2000
print('Limit average length')
df_plot.loc[df_plot['AverageLength'] > 10000, 'AverageLength'] = 10000
# NOTE(review): the length figures below plot total `Length`, not `AverageLength`, so
# the AverageLength cap does not affect any figure here — confirm which was intended.


def summary_boxplot(data, y, title, out_path):
    """Draw a boxplot of column `y` per Fdr value, split by Tool, then save and show it.

    data     -- rows to plot (must contain 'Fdr', 'Tool' and the `y` column)
    y        -- name of the column shown on the y axis
    title    -- figure title
    out_path -- file the figure is written to
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=data, x="Fdr", y=y, hue="Tool", palette=palette, ax=ax)
    ax.set_title(title)
    plt.xticks(rotation=45, ha='right')
    ax.legend(loc='upper left')
    fig.savefig(out_path)
    plt.show()


print('Summary number of peaks and length')
for ps in sorted(set(df_plot['PeaksSource'])):
    for m in sorted(set(df_plot['Modification'])):
        # Same subset feeds both figures for this (peaks source, modification) pair.
        t = df_plot.loc[(df_plot['PeaksSource'] == ps) & (df_plot['Modification'] == m)]

        print('Peaks', ps, m)
        print(f"True peaks {set(t['TruePeaks'])}")
        summary_boxplot(t, 'Peaks', f'{m} {ps}',
                        f'/mnt/stripe/shpynov/2021_chips/summary_peaks_{m}_{ps}.png')

        print('Length', ps, m)
        print(f"True average length {set(t['TrueAverageLength'])}")
        summary_boxplot(t, 'Length', f'{m} {ps}',
                        f'/mnt/stripe/shpynov/2021_chips/summary_length_{m}_{ps}.png')

In [None]:
# Cap extreme values for display only. The caps are applied to a plotting copy so that
# `df` keeps the real values: RecallF is later computed as Recall / df['Peaks'], and
# capping the shared frame in place would silently distort it.
print('Limit peaks number')
df_plot = df.copy()
df_plot.loc[df_plot['Peaks'] > 2000, 'Peaks'] = 2000
print('Limit average length')
df_plot.loc[df_plot['AverageLength'] > 10000, 'AverageLength'] = 10000
# NOTE(review): the length grids below plot total `Length`, not `AverageLength`, so
# the AverageLength cap does not affect any figure here — confirm which was intended.


def boxplot_grid(data, y, title_prefix, out_path,
                 libraries=('1mln', '500k', '200k'), mults=('_1.0', '_0.5', '_0.2')):
    """Draw a Library x Mult grid of Fdr-vs-`y` boxplots split by Tool, save and show it.

    data         -- rows to plot (must contain 'Library', 'Mult', 'Fdr', 'Tool' and `y`)
    y            -- name of the column shown on the y axis
    title_prefix -- text placed before '<lib> <mult>' in every panel title
    out_path     -- file the figure is written to
    libraries    -- Library values, one grid row each
    mults        -- Mult values, one grid column each

    Panels whose (Library, Mult) subset is empty are left blank instead of raising.
    Only the first panel keeps its legend.
    """
    rows, cols = len(libraries), len(mults)
    fig = plt.figure(figsize=(5 * cols, 5 * rows))
    panels = [(lib, mult) for lib in libraries for mult in mults]
    for i, (lib, mult) in enumerate(panels, start=1):
        ax = plt.subplot(rows, cols, i)
        ax.set_title(f'{title_prefix} {lib} {mult}')
        t = data.loc[(data['Mult'] == mult) & (data['Library'] == lib)]
        if t.empty:
            # Nothing to draw: max() would be NaN (invalid axis limit) and no legend exists.
            continue
        sns.boxplot(data=t, x="Fdr", y=y, hue="Tool", palette=palette, ax=ax)
        plt.xticks(rotation=45, ha='right')
        plt.ylim(-20, t[y].max() * 1.2)
        legend = ax.get_legend()
        if i == 1:
            ax.legend(loc='upper left')
        elif legend is not None:
            legend.remove()
    plt.savefig(out_path)
    plt.show()


print('Summary number of peaks and length')
for ps in sorted(set(df_plot['PeaksSource'])):
    for m in sorted(set(df_plot['Modification'])):
        # Same subset feeds both grids for this (peaks source, modification) pair.
        subset = df_plot.loc[(df_plot['PeaksSource'] == ps) & (df_plot['Modification'] == m)]

        print('Peaks', ps, m)
        print(f"True peaks {set(subset['TruePeaks'])}")
        boxplot_grid(subset, 'Peaks', f'{m} {ps}',
                     f'/mnt/stripe/shpynov/2021_chips/peaks_{m}_{ps}.png')

        print('Length', ps, m)
        print(f"True average length {set(subset['TrueAverageLength'])}")
        boxplot_grid(subset, 'Length', f'{m} {ps}',
                     f'/mnt/stripe/shpynov/2021_chips/length_{m}_{ps}.png')

## Precision / Recall

In [None]:
# Normalise the Precision and Recall columns into PrecisionF / RecallF, order rows by
# ascending RecallF (ties: descending PrecisionF), then replace remaining NaN with 1.
# NOTE(review): Precision is divided by TruePeaks and Recall by Peaks — confirm these
# denominators match how the report defines the two columns.
df = (
    df.assign(
        PrecisionF=df['Precision'] / df['TruePeaks'],
        RecallF=df['Recall'] / df['Peaks'],
    )
    .sort_values(by=['RecallF', 'PrecisionF'], ascending=[True, False])
    .fillna(1)
)
df

In [None]:
# Precision / Recall plot
#
# One figure per (PeaksSource, Modification); each figure is a 3x3 grid of
# Library x Mult panels. Within a panel, one faint curve per Tool is drawn for every
# value of I, so the curves for the same tool overlay each other.

for ps in sorted(set(df['PeaksSource'])):
    for m in sorted(set(df['Modification'])):
        print(ps, m)
        fig = plt.figure(figsize=(5 * 3, 5 * 3))
        i = 1
        for lib in ['1mln', '500k', '200k']:
            for mult in ['_1.0', '_0.5', '_0.2']:
                t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m) &
                           (df['Mult'] == mult) & (df['Library'] == lib)]
                ax = plt.subplot(3, 3, i)
                for I in sorted(set(t['I'])):
                    replicate = t.loc[t['I'] == I].sort_values(by=['Tool', 'PrecisionF', 'RecallF'])
                    sns.lineplot(data=replicate, x="PrecisionF", y="RecallF", hue='Tool',
                                 palette=palette, alpha=0.2, ax=ax)

                # Every lineplot call contributes its own legend entries, so the same tool
                # shows up once per value of I. Keep the first handle per label; this does
                # not depend on which tools the last replicate happened to contain.
                legend = ax.get_legend()
                if i == 1:
                    handles, labels = ax.get_legend_handles_labels()
                    first_handle = {}
                    for handle, label in zip(handles, labels):
                        first_handle.setdefault(label, handle)
                    if first_handle:
                        # loc is passed here because this call replaces any earlier legend.
                        ax.legend(handles=list(first_handle.values()),
                                  labels=list(first_handle.keys()),
                                  loc='lower left')
                elif legend is not None:
                    # get_legend() is None when the panel had no data to plot.
                    legend.remove()

                i += 1
                plt.xlim(-0.1, 1.1)
                plt.ylim(-0.1, 1.1)
                plt.title(f'{m} {ps} {lib} {mult}')
        plt.savefig(f'/mnt/stripe/shpynov/2021_chips/prc_plot_{m}_{ps}.png')
        plt.show()

In [None]:
# Investigate bad sampling: rows where both fractions are strictly between 0 and 0.2.
low_precision = df['PrecisionF'].gt(0) & df['PrecisionF'].lt(0.2)
low_recall = df['RecallF'].gt(0) & df['RecallF'].lt(0.2)
df.loc[low_precision & low_recall]

In [None]:
# for lib in set(df['Library']):
#     print(lib)
#     t = df.loc[df['Library'] == lib].copy()
#     for m in sorted(set(df['Modification'])):
#         fig = plt.figure(figsize=(8 * sources, 8))
#         tt = t.loc[t['Modification']==m].copy()
#         for i, ps in enumerate(sorted(set(df['PeaksSource']))):
#             ax = plt.subplot(1, sources, i + 1)
#             ttt = tt.loc[tt['PeaksSource'] == ps].copy()
#             ttt.sort_values(by=['Tool'], inplace=True)
#             sns.scatterplot(data=ttt, x='PrecisionF', y='RecallF', hue='Tool', alpha=0.2, palette=palette)
#             plt.title(f'{m} {ps} {tt["TruePeaks"].values[0]}')
#             if i == 0:
#                 ax.legend(loc='lower left')
#             else:
#                 ax.get_legend().remove()
#         plt.show()

# Average precision

AP summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold, with the increase in recall from the previous threshold used as the weight:

$AP = \sum_n (R_n - R_{n-1}) P_n$, where $P_n$ and $R_n$ are the precision and recall at the $n$-th threshold.

This implementation is not interpolated and is different from computing the area under the precision-recall curve with the trapezoidal rule, which uses linear interpolation and can be too optimistic.

In [None]:
def average_precision(recall, precision):
    """Non-interpolated average precision: sum of (R_n - R_{n-1}) * P_n with R_0 = 0.

    recall, precision -- equal-length 1-D sequences, already ordered by threshold
                         (here: ascending recall).
    Returns 0.0 for empty input.
    """
    recall = np.asarray(recall, dtype=float)
    precision = np.asarray(precision, dtype=float)
    # prepend=0.0 makes the first weight equal to the first recall value itself.
    return float(np.sum(np.diff(recall, prepend=0.0) * precision))


# One AP value per (Modification, PeaksSource, Library, Mult, I, Tool).
# Rows are collected in a list and the frame is built once at the end: growing a
# DataFrame row by row is quadratic and starts from object-dtype columns.
ap_records = []
for ps in sorted(set(df['PeaksSource'])):
    for m in sorted(set(df['Modification'])):
        print(ps, m)
        for lib in ['1mln', '500k', '200k']:
            for mult in ['_1.0', '_0.5', '_0.2']:
                t = df.loc[(df['PeaksSource'] == ps) & (df['Modification'] == m) &
                           (df['Mult'] == mult) & (df['Library'] == lib)]
                for tool in sorted(set(t['Tool'])):
                    for i in sorted(set(t['I'])):
                        # Points of one precision/recall curve, ordered by recall.
                        tt = (t.loc[(t['Tool'] == tool) & (t['I'] == i)]
                               .sort_values(by=['RecallF', 'PrecisionF']))
                        ap = average_precision(tt['RecallF'], tt['PrecisionF'])
                        ap_records.append((m, ps, lib, mult, i, tool, ap))

dfap = pd.DataFrame(
    ap_records,
    columns=['Modification', 'PeaksSource', 'Library', 'Mult', 'I', 'Tool', 'AP'],
)


In [None]:
# Inspect the average-precision table
# (one row per Modification / PeaksSource / Library / Mult / I / Tool).
dfap

In [None]:
# One AP boxplot per modification: PeaksSource on the x axis, one box per Tool.
for m in sorted(set(dfap['Modification'])):
    print(m)
    # Create the axes explicitly: the legend call below must act on this figure,
    # not on whatever `ax` an earlier cell left in the kernel namespace.
    fig, ax = plt.subplots(figsize=(10, 5))
    tt = dfap.loc[dfap['Modification'] == m].sort_values(by=['Tool'])
    sns.boxplot(x='PeaksSource', y='AP', hue='Tool', data=tt, palette=palette, ax=ax)
    ax.set_title(f'{m}')
    ax.legend(loc='lower left')
    fig.savefig(f'/mnt/stripe/shpynov/2021_chips/summary_ap_{m}.png')
    plt.show()

In [None]:
# Per-modification 3x3 grid (Library rows x Mult columns) of AP boxplots:
# PeaksSource on the x axis, one box per Tool. Only the first panel keeps its legend.
for m in sorted(set(dfap['Modification'])):
    print(m)
    fig = plt.figure(figsize=(5 * 3, 5 * 3))
    i = 1
    for lib in ['1mln', '500k', '200k']:
        for mult in ['_1.0', '_0.5', '_0.2']:
            tt = dfap.loc[(dfap['Modification'] == m) &
                          (dfap['Mult'] == mult) & (dfap['Library'] == lib)].sort_values(by=['Tool'])
            ax = plt.subplot(3, 3, i)
            if not tt.empty:
                sns.boxplot(x='PeaksSource', y='AP', hue='Tool', data=tt, palette=palette, ax=ax)
            plt.title(f'{m} {lib} {mult}')
            # get_legend() is None for a panel without data; skip legend handling then.
            legend = ax.get_legend()
            if legend is not None:
                if i == 1:
                    ax.legend(loc='lower left')
                else:
                    legend.remove()
            i += 1
            plt.ylim(-0.1, 1.1)
    plt.savefig(f'/mnt/stripe/shpynov/2021_chips/ap_{m}.png')
    plt.show()