In [1]:
import sys

import pandas as pd
import numpy as np

sys.path.insert(0, '../scripts')
from flaHMM_functions import *

In [2]:
NX_all=pd.read_csv('../data/NX_stats.csv', index_col=0)

In [3]:
test = ['Dbia.GCF_018148935','Dbia.d101g','Dbia.d15genomes',
                       'Dere.GCF_003286155','Dere.d101g','Dere.droEre1','Dere.d15genomes',
                       'Dsuz.GCF_013340165',
                       'Dtei.GCF_016746235','Dtei.d101g_2733','Dtei.d101g_CT02']

In [4]:
test2 = ['Dfic.GCF_018152265','Dfic.d101g','Dfic.GCF_000220665',
                       'Dosh.d101g',
                       'Dath.GCA_008121215',
                       'Dazt.GCA_005876895',
                       'Dmir.GCF_003369915',
                       'Dper.GCF_003286085','Dper.d101g','Dper.d15genomes',
                       'Dpse.d15genomes','Dpse.GCF_009870125',
                       'Dinn.GCF_004354385',
                       'Damb.d101g',
                       'Dbif.GCA_009664405',
                       'Dobs.d101g','Dobs.GCF_018151105',
                       'Dtris.d101g']

## Define helper function

In [5]:
def get_test_results(X_test,species_test,NX_value,threshold):
    X_test = X_test[X_test['species_test']==species_test]
    
    #======================Use genome assembly statistics to filter for small chrs  N50===============================
    threshold_chr=int(NX_all[(NX_all['NG (X)']==NX_value)&(NX_all['Species']==species_test)]['scaffold length (bp)'].values[0])
    chr_size=X_test.groupby('chr').max(numeric_only=True)[['bin_end']]
    
    keep_chrs=chr_size[chr_size['bin_end']>=threshold_chr]
    X_test_filtered_NX=X_test[X_test['chr'].isin(keep_chrs.index.tolist())]

    return(np.array(X_test["region_binary"]),np.array(X_test['pred_'+str(threshold)]),np.array(X_test_filtered_NX["region_binary"]),np.array(X_test_filtered_NX['pred_'+str(threshold)]))

## Retrieve predictions for all flam-syntenic regions

In [6]:
threshold = 0.075
bin_size = 5
X_test_all = pd.read_csv('results_combinations/ext/X_test_all_extendedList_Bin_'+str(bin_size)+'k_threshold_'+str(threshold)+'.txt', sep='\t')

In [7]:
# Run for all species in flam-syntenic test set

NX_threshold = 90

for species in test:
    X_true, X_pred, X_true_NX, X_pred_NX = get_test_results(X_test_all,species,NX_threshold,threshold)
    
    TP = np.array((X_true==1)&(X_pred==1)).sum()
    FP = np.array((X_true!=1)&(X_pred==1)).sum()
    FN = np.array((X_true==1)&(X_pred!=1)).sum()
    TN = np.array((X_true!=1)&(X_pred!=1)).sum()
    TPR = TP/(TP+FN)
    FPR = FP/(FP+TN)

    #print("%s\tTP=%d FP=%d FN=%d TN=%d\tTPR=%.4f\tFPR=%.4f" % (species,TP,FP,FN,TN,TPR,FPR))
    #print("%s\t%d %d %d %d\t%.4f\t%.4f" % (species,TP,FP,FN,TN,TPR,FPR))
    
    TP_NX = np.array((X_true_NX==1)&(X_pred_NX==1)).sum()
    FP_NX = np.array((X_true_NX!=1)&(X_pred_NX==1)).sum()
    FN_NX = np.array((X_true_NX==1)&(X_pred_NX!=1)).sum()
    TN_NX = np.array((X_true_NX!=1)&(X_pred_NX!=1)).sum()
    TPR_NX = TP_NX/(TP_NX+FN_NX)
    FPR_NX = FP_NX/(FP_NX+TN_NX)

    #print("%s\tTP=%d FP=%d FN=%d TN=%d\tTPR=%.4f\tFPR=%.4f [NX threshold: %d]" % (species,TP_NX,FP_NX,FN_NX,TN_NX,TPR_NX,FPR_NX,NX_threshold))
    print("%s\t%d %d %d %d\t%.4f\t%.4f" % (species,TP_NX,FP_NX,FN_NX,TN_NX,TPR_NX,FPR_NX))

Dbia.GCF_018148935	45 780 1 66208	0.9783	0.0116
Dbia.d101g	45 854 1 66134	0.9783	0.0127
Dbia.d15genomes	38 612 2 65280	0.9500	0.0093
Dere.GCF_003286155	42 355 14 52595	0.7500	0.0067
Dere.d101g	35 8 3 49164	0.9211	0.0002
Dere.droEre1	35 46 17 55096	0.6731	0.0008
Dere.d15genomes	42 25 14 47453	0.7500	0.0005
Dsuz.GCF_013340165	137 4331 0 92172	1.0000	0.0449
Dtei.GCF_016746235	155 0 0 53951	1.0000	0.0000
Dtei.d101g_2733	36 1040 1 53901	0.9730	0.0189
Dtei.d101g_CT02	42 403 2 51725	0.9545	0.0077


## Retrieve predictions for all flam-like regions

In [8]:
# Run for all species in external test set

NX_threshold = 90

for species in test2:
    X_true, X_pred, X_true_NX, X_pred_NX = get_test_results(X_test_all,species,NX_threshold,threshold)
    
    TP = np.array((X_true==1)&(X_pred==1)).sum()
    FP = np.array((X_true!=1)&(X_pred==1)).sum()
    FN = np.array((X_true==1)&(X_pred!=1)).sum()
    TN = np.array((X_true!=1)&(X_pred!=1)).sum()
    TPR = TP/(TP+FN)
    FPR = FP/(FP+TN)

    #print("%s\tTP=%d FP=%d FN=%d TN=%d\tTPR=%.4f\tFPR=%.4f" % (species,TP,FP,FN,TN,TPR,FPR))
    #print("%s\t%d %d %d %d\t%.4f\t%.4f" % (species,TP,FP,FN,TN,TPR,FPR))
    
    TP_NX = np.array((X_true_NX==1)&(X_pred_NX==1)).sum()
    FP_NX = np.array((X_true_NX!=1)&(X_pred_NX==1)).sum()
    FN_NX = np.array((X_true_NX==1)&(X_pred_NX!=1)).sum()
    TN_NX = np.array((X_true_NX!=1)&(X_pred_NX!=1)).sum()
    TPR_NX = TP_NX/(TP_NX+FN_NX)
    FPR_NX = FP_NX/(FP_NX+TN_NX)

    #print("%s\tTP=%d FP=%d FN=%d TN=%d\tTPR=%.4f\tFPR=%.4f [NX threshold: %d]" % (species,TP_NX,FP_NX,FN_NX,TN_NX,TPR_NX,FPR_NX,NX_threshold))
    print("%s\t%d %d %d %d\t%.4f\t%.4f" % (species,TP_NX,FP_NX,FN_NX,TN_NX,TPR_NX,FPR_NX))

Dfic.GCF_018152265	101 509 11 59861	0.9018	0.0084
Dfic.d101g	101 509 15 59857	0.8707	0.0084
Dfic.GCF_000220665	0 101 16 55487	0.0000	0.0018
Dosh.d101g	108 261 0 64921	1.0000	0.0040
Dath.GCA_008121215	157 142 1 70570	0.9937	0.0020
Dazt.GCA_005876895	73 2840 0 76097	1.0000	0.0360
Dmir.GCF_003369915	87 7477 4 97138	0.9560	0.0715
Dper.GCF_003286085	50 3053 0 67457	1.0000	0.0433
Dper.d101g	25 510 0 54597	1.0000	0.0093
Dper.d15genomes	7 1194 0 57987	1.0000	0.0202
Dpse.d15genomes	48 1339 1 56016	0.9796	0.0233
Dpse.GCF_009870125	57 2644 7 59216	0.8906	0.0427
Dinn.GCF_004354385	49 110 0 62321	1.0000	0.0018
Damb.d101g	0 99 0 58405	nan	0.0017
Dbif.GCA_009664405	19 263 6 72962	0.7600	0.0036


  TPR_NX = TP_NX/(TP_NX+FN_NX)


Dobs.d101g	12 647 2 64249	0.8571	0.0100
Dobs.GCF_018151105	12 840 0 64058	1.0000	0.0129
Dtris.d101g	12 28 1 57145	0.9231	0.0005
