# Summary peak calling table processing

In order to create Peak Calling Summary table, collect all the data using the following script.
```
# Y20O20
run PeakCallerTuning experiments/src/main/resources/datasets/Y20O20.yaml -zsb

# ENCODE
run PeakCallerTuning experiments/src/main/resources/datasets/cd14encode.yaml -zsb
```

In [14]:
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
import numpy as np
from statistics import stdev
from collections import OrderedDict, Counter


df = pd.read_csv('/mnt/stripe/bio/experiments/configs/Y20O20/benchmark/Y20O20_peaks_summary_uli.tsv', 
                 sep='\t', comment='#')
# Ignore Failed tracks
df = df.loc[df['status'] != 'failed']
display(df.head(3))

Unnamed: 0,donor,modification,tool,peaks,length,frip,procedure,params,file,status
1,OD7,H3K27ac,SPAN,24849,31939200,0.329166,tuned,200_1.0E-12_5,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok
2,OD1,H3K27ac,SPAN,24760,37517000,0.182379,tuned,200_1.0E-4_2,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok
3,OD2,H3K27ac,SPAN,20260,27315400,0.222901,tuned,200_1.0E-10_5,/mnt/stripe/bio/experiments/configs/Y20O20/ben...,ok


In [15]:
toshow = OrderedDict([
    ("H3K27ac", [("MACS2", "default"), ("SPAN", "tuned")]),
    ("H3K27me3", [("MACS2", "default"), ("SICER", "default"), ("SPAN", "tuned")]),
    ("H3K36me3", [("MACS2", "default"), ("SICER", "default"), ("SPAN", "tuned")]),
    ("H3K4me1", [("MACS2", "default"), ("SPAN", "tuned")]),
    ("H3K4me3", [("MACS2", "default"), ("SPAN", "tuned")])
])

result = pd.DataFrame(columns=['mark', 'tool', 'procedure', 
                               'peaks_avg', 'peaks_stdev', 
                               'sumlen_avg', 'sumlen_stdev',
                               'len_avg', 'len_stdev',
                               'frip_avg', 'frip_stdev'])


# Filter out dataframe to given combination of target, tool and procedure.
for m, tps in toshow.items():
    for (t, p) in tps:
        dfmtp = df.loc[np.logical_and(df['tool'] == t, 
                          np.logical_and(df['modification'] == m, 
                                         df['procedure'] == p))]
        peaks_avg = int(np.mean(dfmtp['peaks']))
        peaks_stdev = int(stdev(dfmtp['peaks']))
        sumlen_avg = int(np.mean(dfmtp['length'] / 10e6))
        sumlen_stdev = int(stdev(dfmtp['length'] / 10e6))
        l = dfmtp['length'] / dfmtp['peaks']
        l.loc[~np.isfinite(l)] = 0.0
        len_avg = int(np.mean(l))
        len_stdev = int(stdev(l))
        frip_avg = int(np.mean([f * 100 for f in dfmtp['frip'] if not np.isnan(f)]))
        frip_stdev = int(stdev([f * 100 for f in dfmtp['frip'] if not np.isnan(f)]))
        result.loc[len(result)] = (m, t, p, 
                                   peaks_avg, peaks_stdev, 
                                   sumlen_avg, sumlen_stdev,
                                   len_avg, len_stdev, 
                                   frip_avg, frip_stdev)

display(result)

Unnamed: 0,mark,tool,procedure,peaks_avg,peaks_stdev,sumlen_avg,sumlen_stdev,len_avg,len_stdev,frip_avg,frip_stdev
0,H3K27ac,MACS2,default,26161,8841,3,1,1251,164,23,14
1,H3K27ac,SPAN,tuned,23385,3709,3,0,1426,267,23,11
2,H3K27me3,MACS2,default,11370,5310,2,1,1605,611,9,6
3,H3K27me3,SICER,default,19144,6595,14,6,7421,1194,21,9
4,H3K27me3,SPAN,tuned,6710,600,4,0,6753,778,12,3
5,H3K36me3,MACS2,default,49556,15804,10,4,1939,717,25,12
6,H3K36me3,SICER,default,31830,5082,36,5,11606,2461,51,11
7,H3K36me3,SPAN,tuned,11618,1686,30,5,26715,4332,46,12
8,H3K4me1,MACS2,default,71650,26671,9,4,1176,344,28,15
9,H3K4me1,SPAN,tuned,63876,7679,15,1,2521,459,38,11


# Final talble for the paper

In [17]:
def parse(row):
    return int(row['peaks_avg']), int(row['peaks_stdev']),\
        int(row['sumlen_avg']), int(row['sumlen_stdev']),\
        int(row['len_avg']), int(row['len_stdev']),\
        float(row['frip_avg']), float(row['frip_stdev'])


table=pd.DataFrame(columns=['Modification', 'SPAN #peaks', 'SPAN #FRIP', 'SPAN sum peaks length (mbp)', 'SPAN peak length',
         'Golden tool #peaks', 'Golden tool #FRIP', 'Golden tool peaks length (mbp)', 'Golden tool peak length'])
for m in sorted(set(result['mark'])):
    mz = result.loc[result['mark']==m]
    # Processing SPAN     
    zp, zp_sd, zsl, zsl_sd, zl, zl_sd, zf, zf_sd = parse(mz.loc[mz['tool']=='SPAN'])
    # Processing golden tools
    trs = {}
    golden_tools = set(mz.loc[mz['tool'] != 'SPAN']['tool'])
    for t in golden_tools:
        trs[t] = parse(mz.loc[mz['tool']==t])

    tp = ' '.join(['{}±{}({})'.format(trs[t][0], trs[t][1], t) for t in golden_tools])
    tsl = ' '.join(['{}±{}({})'.format(trs[t][2], trs[t][3], t) for t in golden_tools])
    tl = ' '.join(['{}±{}({})'.format(trs[t][4], trs[t][5], t) for t in golden_tools])
    tf = ' '.join(['{}±{}({})'.format(trs[t][6], trs[t][7], t) for t in golden_tools])
    table.loc[len(table)] = (m, '{}±{}'.format(zp, zp_sd), '{}±{}'.format(zf, zf_sd), 
          '{}±{}'.format(zsl, zsl_sd), '{}±{}'.format(zl, zl_sd), 
          tp, tf, tsl, tl)
display(table)
table.to_csv('/mnt/stripe/figures/peaks_table.tsv', sep='\t')

Unnamed: 0,Modification,SPAN #peaks,SPAN #FRIP,SPAN sum peaks length (mbp),SPAN peak length,Golden tool #peaks,Golden tool #FRIP,Golden tool peaks length (mbp),Golden tool peak length
0,H3K27ac,23385±3709,23.0±11.0,3±0,1426±267,26161±8841(MACS2),23.0±14.0(MACS2),3±1(MACS2),1251±164(MACS2)
1,H3K27me3,6710±600,12.0±3.0,4±0,6753±778,11370±5310(MACS2) 19144±6595(SICER),9.0±6.0(MACS2) 21.0±9.0(SICER),2±1(MACS2) 14±6(SICER),1605±611(MACS2) 7421±1194(SICER)
2,H3K36me3,11618±1686,46.0±12.0,30±5,26715±4332,49556±15804(MACS2) 31830±5082(SICER),25.0±12.0(MACS2) 51.0±11.0(SICER),10±4(MACS2) 36±5(SICER),1939±717(MACS2) 11606±2461(SICER)
3,H3K4me1,63876±7679,38.0±11.0,15±1,2521±459,71650±26671(MACS2),28.0±15.0(MACS2),9±4(MACS2),1176±344(MACS2)
4,H3K4me3,15682±1741,8.0±2.0,3±1,1892±547,11035±4630(MACS2),4.0±2.0(MACS2),1±0(MACS2),834±237(MACS2)
