# MoCCA-SV benchmark notebook
The notebook is to carry on MoCCA-SV benchmark. The input files and options are all specified in the following cell.  The output of this benchmark is a pandas data frame, including the performance metrics, such as TP, FP, F1, etc.  The performance will be subsetted by SV types.

---
## Initial setting

In [1]:
%cd /mnt/nfs/gigantor/ifs/DCEG/Projects/CoherentLogic/SV/mocca-bench
! module list

/mnt/nfs/gigantor/ifs/DCEG/Projects/CoherentLogic/SV/mocca-bench
Currently Loaded Modulefiles:
  1) singularity/3.0.1   3) python3/3.6.3       5) bedtools/2.27.1
  2) tmux/2.5            4) gcc/7.2.0           6) zlib/1.2.8


---
## Configuration

In [2]:
TRUTH_BED="data/my-tp-base.bed" 
COMP_BED="data/intrachromosomal_SVs_AJ_2x250_son"
INCLUDED_BED="data/HG002_SVs_Tier1_noVDJorXorY_v0.6.2.bed" # Assign it to "", if there is not included be file
# INCLUDED_BED=""
# Name TMP uniquely if to run multiple benchmark at the same time
TMP="./TMP" #  temporary file is to be used for keep temporary intermediate results, will be overwrriten for each run

# deffine some constants in python 
N=500 # for padding intersion
MIN_OVP=0.7 # for minimum reciprocal overlap
MAX_INS_GAP=20 # assume that breakpoint gap of the insert is no more than 20

genome_file = "./data/hg19_wo_chr.genome" 
#genome_file = "./data/hg19_wi_chr.genome 

# [:memo:] We svaba, breakdancer and manta results in the order in the MoCCA-SV output.
# "#CHROM	start	end	svaba	breakdancer	manta	caller_count"	
columns=[10,11,12]
callers = ['svaba', 'breakdancer', 'manta']
TRUTH_BED_COLS=6 # the turth bed file contains 6 columns

# Bari would like to reduce FP and increase the sensitivity
# So we focus on "OR" not "AND"
ensemble={"svaba_or_manta":[10,12],
          "svaba_or_breakdancer":[10,11],
          "manta_or_breakdancer":[12,11],
          "any_one":[11,12,13]
          }

OUTFN="GIAB_performance.csv"

In [3]:
import os
import subprocess 
import pandas as pd

def run_shell(cmd, get_single_number=False):
    print("Run: "+cmd)
    try:
       grepOut = subprocess.check_output(cmd, shell=True, universal_newlines=True, stderr=subprocess.STDOUT)                       
    except subprocess.CalledProcessError as grepexc:                                                                                                   
        print("Error code", grepexc.returncode,'; ', grepexc.output)
    
    if get_single_number:
        grepOut=int(grepOut)
        
    return grepOut



---
## Data process

### 1. Intersect with the included region if availalbe

In [4]:
truth_bed1=TRUTH_BED
comp_bed1=COMP_BED

if INCLUDED_BED is None or INCLUDED_BED=="":
    print("There is no included region specified!")
else:
    # print ("do something")
    truth_bed1 = os.path.join(TMP, "truth_included.bed")
    comp_bed1 = os.path.join(TMP, "comp_included.bed")
    os.system("mkdir -p %s" % (TMP))
    cmd1="bedtools intersect -a %s -b %s -f 1.0000000 -u > %s" % (TRUTH_BED, INCLUDED_BED, truth_bed1)
    cmd2="bedtools intersect -a %s -b %s -f 1.0000000 -u > %s" % (COMP_BED, INCLUDED_BED, comp_bed1)
    _ = run_shell(cmd1)
    run_shell(cmd2)
    

Run: bedtools intersect -a data/my-tp-base.bed -b data/HG002_SVs_Tier1_noVDJorXorY_v0.6.2.bed -f 1.0000000 -u > ./TMP/truth_included.bed
Run: bedtools intersect -a data/intrachromosomal_SVs_AJ_2x250_son -b data/HG002_SVs_Tier1_noVDJorXorY_v0.6.2.bed -f 1.0000000 -u > ./TMP/comp_included.bed


### 2. Padding the insertions

In [5]:
truth_bed2=os.path.join(TMP, "truth_padded.bed")
comp_bed2=os.path.join(TMP, "comp_padded.bed")

ins_bed=os.path.join(TMP, "ins_padded.bed")
oth_bed=os.path.join(TMP, "others_padded.bed") 
cmd = "bedtools slop -g %s  -b %s "; 
    
pad_truth_cmd = """ 
 awk -v OFS='\\t' '{ 
    if($5 == "INS") 
        print $0 > "%s"; 
    else 
        print $0 > "%s";
    }' %s > /dev/null ; 
  cat %s >%s ; 
  bedtools slop -i %s -g %s  -b %s >> %s ;
""" % (ins_bed, oth_bed, truth_bed1, oth_bed, truth_bed2, ins_bed, genome_file, N, truth_bed2)
        
_= run_shell(pad_truth_cmd)

pad_comp_cmd = """ 
 awk -v OFS='\\t' '{ 
    if($3-$2 <= %d) 
        print $0 > "%s"; 
    else 
        print $0 > "%s";
    }' %s > /dev/null; 
  cat %s > %s;  
  bedtools slop -i %s -g %s  -b %s >> %s;
""" % (MAX_INS_GAP, ins_bed, oth_bed, comp_bed1, oth_bed, comp_bed2, ins_bed, genome_file, N, comp_bed2)
        
_= run_shell(pad_comp_cmd)


# pad_comp_cmd = """ 
#  awk -v OFS='\\t' '{ 
#     cmd = "bedtools slop -g %s  -b %d "; 
#     if($3-$2 <= %d) 
#         print $0 | cmd; 
#     else 
#         print $0 
#     }' %s > %s 
# """ % (genome_file, N, MAX_INS_GAP, comp_bed1, comp_bed2)
        
# _= run_shell(pad_comp_cmd)


Run:  
 awk -v OFS='\t' '{ 
    if($5 == "INS") 
        print $0 > "./TMP/ins_padded.bed"; 
    else 
        print $0 > "./TMP/others_padded.bed";
    }' ./TMP/truth_included.bed > /dev/null ; 
  cat ./TMP/others_padded.bed >./TMP/truth_padded.bed ; 
  bedtools slop -i ./TMP/ins_padded.bed -g ./data/hg19_wo_chr.genome  -b 500 >> ./TMP/truth_padded.bed ;

Run:  
 awk -v OFS='\t' '{ 
    if($3-$2 <= 20) 
        print $0 > "./TMP/ins_padded.bed"; 
    else 
        print $0 > "./TMP/others_padded.bed";
    }' ./TMP/comp_included.bed > /dev/null; 
  cat ./TMP/others_padded.bed > ./TMP/comp_padded.bed;  
  bedtools slop -i ./TMP/ins_padded.bed -g ./data/hg19_wo_chr.genome  -b 500 >> ./TMP/comp_padded.bed;



### 3. Get the intersections between the truth bed and the compute bed 

In [6]:
isec_bed = os.path.join(TMP, "isec.bed")
isec_cmd = """
             bedtools intersect -a %s -b %s -wa -wb -f %f -r > %s
           """ % (truth_bed2, comp_bed2, MIN_OVP, isec_bed)

_=run_shell(isec_cmd)

Run: 
             bedtools intersect -a ./TMP/truth_padded.bed -b ./TMP/comp_padded.bed -wa -wb -f 0.700000 -r > ./TMP/isec.bed
           


### 4. Summary the results
	

In [7]:
rv=[]
for i in columns:
    cmd = """
            awk  '{if($%d=="orig") print $4}' %s | sort -u |wc -l
        """ % (i, isec_bed)
    rv.append(run_shell(cmd))
    
tps = list(map(lambda x: int(x), rv))
print(tps)

Run: 
            awk  '{if($10=="orig") print $4}' ./TMP/isec.bed | sort -u |wc -l
        
Run: 
            awk  '{if($11=="orig") print $4}' ./TMP/isec.bed | sort -u |wc -l
        
Run: 
            awk  '{if($12=="orig") print $4}' ./TMP/isec.bed | sort -u |wc -l
        
[930, 1205, 4475]


In [8]:
# get pred
rv=[]
for i in columns:
    cmd = """
            awk  '{if($%d=="orig") print}' %s | wc -l
        """ % (i-TRUTH_BED_COLS, comp_bed1)
    rv.append(run_shell(cmd, get_single_number=True))
    
preds = rv
print(preds)

Run: 
            awk  '{if($4=="orig") print}' ./TMP/comp_included.bed | wc -l
        
Run: 
            awk  '{if($5=="orig") print}' ./TMP/comp_included.bed | wc -l
        
Run: 
            awk  '{if($6=="orig") print}' ./TMP/comp_included.bed | wc -l
        
[4168, 18254, 5353]


In [9]:
# work on the ensemble call 
# get prediction count using bedtools cluster

comp_cluster = os.path.join(TMP, "comp_cluster.bed")

_ = run_shell("bedtools cluster -i %s > %s" % (comp_bed1, comp_cluster))

cluster_last_col = run_shell("awk 'NR==1{print NF}' %s" %(comp_cluster), get_single_number=True)              
print("last column pos: ", cluster_last_col)  

all_callers=callers

for k in ensemble.keys():
    print(k)
    
    tp_tmp_file=os.path.join(TMP, "%s_tp.tmp" % k)
    pred_tmp_file=os.path.join(TMP, "%s_pred.tmp" % k)
    
    if os.path.exists(tp_tmp_file):
        os.remove(tp_tmp_file)
        
    if os.path.exists(pred_tmp_file):
        os.remove(pred_tmp_file)
        
    for i in ensemble[k]:
        # count tp
        cmd= """
                awk   '{if ( $%d=="orig" ) print $4}' %s >> %s
             """ % (i, isec_bed, tp_tmp_file)
        _=run_shell(cmd)
        
        cmd2="""
                awk   '{if ( $%d=="orig" ) print $%d}' %s >> %s
             """ % (i-TRUTH_BED_COLS, cluster_last_col, comp_cluster, pred_tmp_file)
        _=run_shell(cmd2)
        
    tp=run_shell("sort -u %s |wc -l" % (tp_tmp_file), get_single_number=True)
    pred=run_shell("sort -u %s |wc -l" % (pred_tmp_file), get_single_number=True)
    print(tp)
    print(pred)
    
    all_callers.append(k)
    tps.append(tp)
    preds.append(pred)



Run: bedtools cluster -i ./TMP/comp_included.bed > ./TMP/comp_cluster.bed
Run: awk 'NR==1{print NF}' ./TMP/comp_cluster.bed
last column pos:  27
svaba_or_manta
Run: 
                awk   '{if ( $10=="orig" ) print $4}' ./TMP/isec.bed >> ./TMP/svaba_or_manta_tp.tmp
             
Run: 
                awk   '{if ( $4=="orig" ) print $27}' ./TMP/comp_cluster.bed >> ./TMP/svaba_or_manta_pred.tmp
             
Run: 
                awk   '{if ( $12=="orig" ) print $4}' ./TMP/isec.bed >> ./TMP/svaba_or_manta_tp.tmp
             
Run: 
                awk   '{if ( $6=="orig" ) print $27}' ./TMP/comp_cluster.bed >> ./TMP/svaba_or_manta_pred.tmp
             
Run: sort -u ./TMP/svaba_or_manta_tp.tmp |wc -l
Run: sort -u ./TMP/svaba_or_manta_pred.tmp |wc -l
4590
7000
svaba_or_breakdancer
Run: 
                awk   '{if ( $10=="orig" ) print $4}' ./TMP/isec.bed >> ./TMP/svaba_or_breakdancer_tp.tmp
             
Run: 
                awk   '{if ( $4=="orig" ) print $27}' ./TMP/comp_cluster.bed >>

In [10]:
truth_cnt = run_shell("cat %s | wc -l" % (TRUTH_BED), get_single_number=True)
df = pd.DataFrame({"Truth":truth_cnt, "TP":tps, 
                   "PRED":preds}, index=all_callers).assign(
                Sp=lambda df: df.TP/df.PRED, 
                Sn=lambda df: df.TP/df.Truth,
                FN=lambda df:1-df.Sn )

df.index.name="Caller" 
print(df)

Run: cat data/my-tp-base.bed | wc -l
                      Truth    TP   PRED        Sp        Sn        FN
Caller                                                                
svaba                  9361   930   4168  0.223129  0.099348  0.900652
breakdancer            9361  1205  18254  0.066013  0.128726  0.871274
manta                  9361  4475   5353  0.835980  0.478047  0.521953
svaba_or_manta         9361  4590   7000  0.655714  0.490332  0.509668
svaba_or_breakdancer   9361  1458  19158  0.076104  0.155753  0.844247
manta_or_breakdancer   9361  4584  20994  0.218348  0.489691  0.510309
any_one                9361  4584  20994  0.218348  0.489691  0.510309


In [11]:
df.to_csv(OUTFN)