# Summary peak calling table processing

In order to create Peak Calling Summary table, collect all the data using the following script.
```
# Y20O20
run PeakCallerTuning experiments/src/main/resources/datasets/Y20O20.yaml -zsb

# ENCODE
run PeakCallerTuning experiments/src/main/resources/datasets/cd14encode.yaml -zsb
```

In [5]:
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
import numpy as np
from statistics import stdev
from collections import OrderedDict, Counter


df = pd.read_csv('/mnt/stripe/bio/experiments/configs/Y20O20/benchmark/Y20O20_peaks_summary.tsv', 
                 sep='\t', comment='#')
# Ignore Failed tracks
df = df.loc[df['status'] != 'failed']
display(df.head(3))

Unnamed: 0,donor,modification,tool,peaks,length,frip,procedure,params,file,status
1,OD13,H3K27ac,ZINBRA,23136,27689000,0.39998,tuned,200_1.0E-12_5_peaks,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok
2,OD4,H3K27ac,ZINBRA,26391,35877200,0.448657,tuned,200_1.0E-10_5_peaks,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok
3,OD5,H3K27ac,ZINBRA,21659,31826200,0.145641,tuned,200_1.0E-6_5_peaks,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok


In [6]:
toshow = OrderedDict([
    ("H3K27ac", [("MACS2", "default"), ("ZINBRA", "tuned")]),
    ("H3K27me3", [("MACS2", "default"), ("SICER", "default"), ("ZINBRA", "tuned")]),
    ("H3K36me3", [("MACS2", "default"), ("SICER", "default"), ("ZINBRA", "tuned")]),
    ("H3K4me1", [("MACS2", "default"), ("ZINBRA", "tuned")]),
    ("H3K4me3", [("MACS2", "default"), ("ZINBRA", "tuned")])
])

result = pd.DataFrame(columns=['mark', 'tool', 'procedure', 
                               'peaks_avg', 'peaks_stdev', 
                               'sumlen_avg', 'sumlen_stdev',
                               'len_avg', 'len_stdev',
                               'frip_avg', 'frip_stdev'])


# Filter out dataframe to given combination of target, tool and procedure.
for m, tps in toshow.items():
    for (t, p) in tps:
        dfmtp = df.loc[np.logical_and(df['tool'] == t, 
                          np.logical_and(df['modification'] == m, 
                                         df['procedure'] == p))]
        peaks_avg = int(np.mean(dfmtp['peaks']))
        peaks_stdev = int(stdev(dfmtp['peaks']))
        sumlen_avg = int(np.mean(dfmtp['length'] / 10e6))
        sumlen_stdev = int(stdev(dfmtp['length'] / 10e6))
        l = dfmtp['length'] / dfmtp['peaks']
        l.loc[~np.isfinite(l)] = 0.0
        len_avg = int(np.mean(l))
        len_stdev = int(stdev(l))
        frip_avg = int(np.mean([f * 100 for f in dfmtp['frip'] if not np.isnan(f)]))
        frip_stdev = int(stdev([f * 100 for f in dfmtp['frip'] if not np.isnan(f)]))
        result.loc[len(result)] = (m, t, p, 
                                   peaks_avg, peaks_stdev, 
                                   sumlen_avg, sumlen_stdev,
                                   len_avg, len_stdev, 
                                   frip_avg, frip_stdev)

display(result)

Unnamed: 0,mark,tool,procedure,peaks_avg,peaks_stdev,sumlen_avg,sumlen_stdev,len_avg,len_stdev,frip_avg,frip_stdev
0,H3K27ac,MACS2,default,71253,26590,9,3,1257,176,31,14
1,H3K27ac,ZINBRA,tuned,23385,3709,3,0,1426,267,23,11
2,H3K27me3,MACS2,default,52654,26741,6,3,1187,236,16,8
3,H3K27me3,SICER,default,32639,8280,19,6,5890,1259,25,9
4,H3K27me3,ZINBRA,tuned,6710,600,4,0,6753,778,12,3
5,H3K36me3,MACS2,default,91365,23039,25,7,2821,911,45,14
6,H3K36me3,SICER,default,39948,8636,39,6,10138,2410,53,11
7,H3K36me3,ZINBRA,tuned,11618,1686,30,5,26715,4332,46,12
8,H3K4me1,MACS2,default,117774,26641,18,6,1485,454,42,17
9,H3K4me1,ZINBRA,tuned,63876,7679,15,1,2521,459,38,11


# Final talble for the paper

In [4]:
def parse(row):
    return int(row['peaks_avg']), int(row['peaks_stdev']),\
        int(row['sumlen_avg']), int(row['sumlen_stdev']),\
        int(row['len_avg']), int(row['len_stdev']),\
        float(row['frip_avg']), float(row['frip_stdev'])


table=pd.DataFrame(columns=['Modification', 'ZINBRA #peaks', 'ZINBRA #FRIP', 'ZINBRA sum peaks length (mbp)', 'ZINBRA peak length',
         'Golden tool #peaks', 'Golden tool #FRIP', 'Golden tool peaks length (mbp)', 'Golden tool peak length'])
for m in sorted(set(result['mark'])):
    mz = result.loc[result['mark']==m]
    # Processing ZINBRA     
    zp, zp_sd, zsl, zsl_sd, zl, zl_sd, zf, zf_sd = parse(mz.loc[mz['tool']=='ZINBRA'])
    # Processing golden tools
    trs = {}
    golden_tools = set(mz.loc[mz['tool'] != 'ZINBRA']['tool'])
    for t in golden_tools:
        trs[t] = parse(mz.loc[mz['tool']==t])

    tp = ' '.join(['{}±{}({})'.format(trs[t][0], trs[t][1], t) for t in golden_tools])
    tsl = ' '.join(['{}±{}({})'.format(trs[t][2], trs[t][3], t) for t in golden_tools])
    tl = ' '.join(['{}±{}({})'.format(trs[t][4], trs[t][5], t) for t in golden_tools])
    tf = ' '.join(['{}±{}({})'.format(trs[t][6], trs[t][7], t) for t in golden_tools])
    table.loc[len(table)] = (m, '{}±{}'.format(zp, zp_sd), '{}±{}'.format(zf, zf_sd), 
          '{}±{}'.format(zsl, zsl_sd), '{}±{}'.format(zl, zl_sd), 
          tp, tf, tsl, tl)
display(table)

Unnamed: 0,Modification,ZINBRA #peaks,ZINBRA #FRIP,ZINBRA sum peaks length (mbp),ZINBRA peak length,Golden tool #peaks,Golden tool #FRIP,Golden tool peaks length (mbp),Golden tool peak length
0,H3K27ac,23385±3709,23.0±11.0,3±0,1426±267,71253±26590(MACS2),31.0±14.0(MACS2),9±3(MACS2),1257±176(MACS2)
1,H3K27me3,6710±600,12.0±3.0,4±0,6753±778,32639±8280(SICER) 52654±26741(MACS2),25.0±9.0(SICER) 16.0±8.0(MACS2),19±6(SICER) 6±3(MACS2),5890±1259(SICER) 1187±236(MACS2)
2,H3K36me3,11618±1686,46.0±12.0,30±5,26715±4332,39948±8636(SICER) 91365±23039(MACS2),53.0±11.0(SICER) 45.0±14.0(MACS2),39±6(SICER) 25±7(MACS2),10138±2410(SICER) 2821±911(MACS2)
3,H3K4me1,63876±7679,38.0±11.0,15±1,2521±459,117774±26641(MACS2),42.0±17.0(MACS2),18±6(MACS2),1485±454(MACS2)
4,H3K4me3,15682±1741,8.0±2.0,3±1,1892±547,23389±8202(MACS2),8.0±4.0(MACS2),2±1(MACS2),956±194(MACS2)
