# Summary peak calling table processing

In order to create Peak Calling Summary table, collect all the data using the following script.
```
# Aging defaults
cd /mnt/stripe/bio/experiments/aging/peak_calling

# Aging tuned
cd /mnt/stripe/bio/experiments/configs/Y20O20/benchmark

# ENCODE defaults
cd /mnt/stripe/bio/experiments/configs/benchmark_encode/defaults

# ENCODE tuned
cd /mnt/stripe/bio/experiments/configs/benchmark_encode/tuned

TAB=$'\t'; 
for M in H3K27ac H3K27me3 H3K36me3 H3K4me1 H3K4me3; do  
    for TOOL in macs_broad sicer zinbra; do  
        if [[ -d ${M}/${TOOL} ]]; then  
            for F in $(find $(pwd)/${M}/${TOOL} -maxdepth 1 | grep -E 'peaks\.bed$|island\.bed$|broadPeak$|narrowPeak$'); do  
                DONOR=$(echo "$F" | sed 's#.*/.D_##g' | sed 's#.*CD14_##g' | sed 's#.*broad/##g'| sed 's#.*sicer/##g' | sed 's#_.*##g' );  
                N=$(wc -l $F | sed 's# .*##g');  
                L=$(awk 'BEGIN{L=0} {L+=($3-$2)} END{print L}' $F); 
                RIPL=$(cat ${F}_rip.csv | grep 'length'); 
                # Pick correct RIP number
                if [[ -z $RIPL ]]; then 
                    FRIP=$(cat ${F}_rip.csv | tail -n 1 | awk -v FS=',' '{print($5 * 100 / $3)}'); else FRIP=$(cat ${F}_rip.csv | tail -n 1 | awk -v FS=',' '{print($6 * 100 / $3)}'); 
                fi; 
                echo "$DONOR$TAB$M$TAB$TOOL$TAB$N$TAB$L$TAB$FRIP$TAB$F"; 
            done; 
        fi; 
    done; 
done
```

In [54]:
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd
import numpy as np
from statistics import stdev
from collections import OrderedDict, Counter


df = pd.read_csv('/mnt/stripe/bio/experiments/figures/Peak Calling Summary - aging.tsv', sep='\t')
# Ignore Failed tracks
df = df.loc[df['Status'] != 'Failed']

toshow = OrderedDict([
    ("H3K27ac", [("MACS2", "default"), ("ZINBRA", "tuned")]),
    ("H3K27me3", [("MACS2", "default"), ("SICER", "default"), ("ZINBRA", "tuned")]),
    ("H3K36me3", [("MACS2", "default"), ("SICER", "default"), ("ZINBRA", "tuned")]),
    ("H3K4me1", [("MACS2", "default"), ("ZINBRA", "tuned")]),
    ("H3K4me3", [("MACS2", "default"), ("ZINBRA", "tuned")])
])

result = pd.DataFrame(columns=['mark', 'tool', 'procedure', 
                               'peaks_avg', 'peaks_stdev', 
                               'sumlen_avg', 'sumlen_stdev',
                               'len_avg', 'len_stdev',
                               'frip_avg', 'frip_stdev'])


# Filter out dataframe to given combination of target, tool and procedure.
for m, tps in toshow.items():
    for (t, p) in tps:
        dfmtp = df.loc[np.logical_and(df['tool'] == t, 
                          np.logical_and(df['modification'] == m, 
                                         df['procedure'] == p))]
        peaks_avg = int(np.mean(dfmtp['peaks']))
        peaks_stdev = int(stdev(dfmtp['peaks']))
        sumlen_avg = int(np.mean(dfmtp['length'] / 10e6))
        sumlen_stdev = int(stdev(dfmtp['length'] / 10e6))
        l = dfmtp['length'] / dfmtp['peaks']
        l.loc[~np.isfinite(l)] = 0.0
        len_avg = int(np.mean(l))
        len_stdev = int(stdev(l))
        frip_avg = int(np.mean(dfmtp['FRiP']))
        frip_stdev = int(stdev(dfmtp['FRiP']))
        result.loc[len(result)] = (m, t, p, 
                                   peaks_avg, peaks_stdev, 
                                   sumlen_avg, sumlen_stdev,
                                  len_avg, len_stdev, 
                                  frip_avg, frip_stdev)

display(result)

Unnamed: 0,mark,tool,procedure,peaks_avg,peaks_stdev,sumlen_avg,sumlen_stdev,len_avg,len_stdev,frip_avg,frip_stdev
0,H3K27ac,MACS2,default,26161.0,8841.0,3.0,1.0,1251.0,164.0,23.0,14.0
1,H3K27ac,ZINBRA,tuned,23385.0,3709.0,3.0,0.0,1426.0,267.0,23.0,11.0
2,H3K27me3,MACS2,default,7517.0,3498.0,1.0,1.0,2108.0,780.0,8.0,5.0
3,H3K27me3,SICER,default,19144.0,6595.0,14.0,6.0,7421.0,1194.0,21.0,9.0
4,H3K27me3,ZINBRA,tuned,6710.0,600.0,4.0,0.0,6753.0,778.0,12.0,3.0
5,H3K36me3,MACS2,default,49556.0,15804.0,10.0,4.0,1939.0,717.0,27.0,13.0
6,H3K36me3,SICER,default,20916.0,3427.0,42.0,6.0,20670.0,4803.0,54.0,11.0
7,H3K36me3,ZINBRA,tuned,10712.0,2148.0,26.0,5.0,25440.0,5008.0,41.0,11.0
8,H3K4me1,MACS2,default,68013.0,23518.0,10.0,4.0,1494.0,307.0,33.0,17.0
9,H3K4me1,ZINBRA,tuned,63876.0,7679.0,15.0,1.0,2521.0,459.0,38.0,11.0


# Final talble for the paper

In [55]:
def parse(row):
    return int(row['peaks_avg']), int(row['peaks_stdev']),\
        int(row['sumlen_avg']), int(row['sumlen_stdev']),\
        int(row['len_avg']), int(row['len_stdev']),\
        float(row['frip_avg']), float(row['frip_stdev'])


table=pd.DataFrame(columns=['Modification', 'ZINBRA #peaks', 'ZINBRA #FRIP', 'ZINBRA sum peaks length (mbp)', 'ZINBRA peak length',
         'Golden tool #peaks', 'Golden tool #FRIP', 'Golden tool peaks length (mbp)', 'Golden tool peak length'])
for m in sorted(set(result['mark'])):
    mz = result.loc[result['mark']==m]
    # Processing ZINBRA     
    zp, zp_sd, zsl, zsl_sd, zl, zl_sd, zf, zf_sd = parse(mz.loc[mz['tool']=='ZINBRA'])
    # Processing golden tools
    trs = {}
    golden_tools = set(mz.loc[mz['tool'] != 'ZINBRA']['tool'])
    for t in golden_tools:
        trs[t] = parse(mz.loc[mz['tool']==t])

    tp = ' '.join(['{}±{}({})'.format(trs[t][0], trs[t][1], t) for t in golden_tools])
    tsl = ' '.join(['{}±{}({})'.format(trs[t][2], trs[t][3], t) for t in golden_tools])
    tl = ' '.join(['{}±{}({})'.format(trs[t][4], trs[t][5], t) for t in golden_tools])
    tf = ' '.join(['{}±{}({})'.format(trs[t][6], trs[t][7], t) for t in golden_tools])
    table.loc[len(table)] = (m, '{}±{}'.format(zp, zp_sd), '{}±{}'.format(zf, zf_sd), 
          '{}±{}'.format(zsl, zsl_sd), '{}±{}'.format(zl, zl_sd), 
          tp, tf, tsl, tl)
display(table)

Unnamed: 0,Modification,ZINBRA #peaks,ZINBRA #FRIP,ZINBRA sum peaks length (mbp),ZINBRA peak length,Golden tool #peaks,Golden tool #FRIP,Golden tool peaks length (mbp),Golden tool peak length
0,H3K27ac,23385±3709,23.0±11.0,3±0,1426±267,26161±8841(MACS2),23.0±14.0(MACS2),3±1(MACS2),1251±164(MACS2)
1,H3K27me3,6710±600,12.0±3.0,4±0,6753±778,19144±6595(SICER) 7517±3498(MACS2),21.0±9.0(SICER) 8.0±5.0(MACS2),14±6(SICER) 1±1(MACS2),7421±1194(SICER) 2108±780(MACS2)
2,H3K36me3,10712±2148,41.0±11.0,26±5,25440±5008,20916±3427(SICER) 49556±15804(MACS2),54.0±11.0(SICER) 27.0±13.0(MACS2),42±6(SICER) 10±4(MACS2),20670±4803(SICER) 1939±717(MACS2)
3,H3K4me1,63876±7679,38.0±11.0,15±1,2521±459,68013±23518(MACS2),33.0±17.0(MACS2),10±4(MACS2),1494±307(MACS2)
4,H3K4me3,15682±1741,8.0±2.0,3±1,1892±547,11035±4630(MACS2),5.0±3.0(MACS2),1±0(MACS2),834±237(MACS2)
