In [9]:
import os
import sys
import pandas as pd
import json

# Make the scoring program importable. The path is relative, so it is resolved
# against the current working directory of the notebook kernel.
score_dir = os.path.abspath('../scoring_program/')
sys.path.append(score_dir)

# NOTE(review): the star import hides where names come from; parse_solution_file
# and get_scores used below are presumably defined in score_combined - confirm.
from score_combined import *

def _values_match(expected, actual):
    """Return True when an expected and a computed metric should count as equal.

    Real numbers are compared with a tight tolerance, and two NaNs are treated
    as equal (NaN survives a json dump/load round trip but never compares equal
    to itself with ``==``). Anything else falls back to plain equality.
    """
    import math
    import numbers

    if not (isinstance(expected, numbers.Real) and isinstance(actual, numbers.Real)):
        return expected == actual
    if expected == actual:
        return True
    expected_f, actual_f = float(expected), float(actual)
    if math.isnan(expected_f) or math.isnan(actual_f):
        return math.isnan(expected_f) and math.isnan(actual_f)
    # Tolerate last-bit floating point noise between environments.
    return math.isclose(expected_f, actual_f, rel_tol=1e-9, abs_tol=1e-12)


def cmp_results(score_type, pred_dict, gt_dict):
    """Print a per-metric comparison of computed scores against stored expectations.

    Args:
        score_type: Key of the score group to compare; used to index ``gt_dict``
            and shown in the report header.
        pred_dict: Mapping of metric name to the computed value.
        gt_dict: Mapping of score group name to a mapping of metric name to the
            expected value.

    Raises:
        KeyError: If ``gt_dict`` has no entry for ``score_type``.

    A metric that is expected but absent from ``pred_dict`` is reported as
    missing instead of aborting the whole comparison.
    """
    WIDTH = 18
    missing = object()  # sentinel so that a legitimate None value is not mistaken for "absent"
    print(f'====== compare {score_type} {"=" * (WIDTH - len(score_type))}')
    for k, v in gt_dict[score_type].items():
        label = k.ljust(WIDTH)
        pred_value = pred_dict.get(k, missing)
        if pred_value is missing:
            print(f'{label}  should be {v}, but is missing from the predictions.')
        elif _values_match(v, pred_value):
            print(f'{label}  matched.')
        else:
            print(f'{label}  should be {v}, but get {pred_value}.')
    print('==================================\n')

def validate(root_dir, score_type='both'):
    """Score the predictions stored in ``root_dir`` and compare them with ``gt.json``.

    Files read from ``root_dir``:
        prediction.txt: header-less CSV, loaded with columns ``filename`` and ``preds``.
        reference.csv:  handed to ``parse_solution_file``, which yields the
                        species-A and the mimic solution frames.
        gt.json:        expected scores, keyed by ``mimic_scores`` / ``A_scores``.

    Args:
        root_dir: Directory that holds the three files above.
        score_type: ``'mimic_scores'`` or ``'A_scores'`` to evaluate a single
            group; any other value (default ``'both'``) evaluates both groups.

    Returns:
        The scores of the requested group, or the tuple
        ``(mimic_scores, A_scores)`` when both groups are evaluated.
    """
    predictions = pd.read_csv(
        os.path.join(root_dir, 'prediction.txt'),
        header=None,
        names=['filename', 'preds'],
    )
    reference_A, reference_mimic = parse_solution_file(os.path.join(root_dir, 'reference.csv'))

    with open(os.path.join(root_dir, 'gt.json'), 'r') as gt_handle:
        expected = json.load(gt_handle)

    # NOTE(review): mm_vals presumably toggles the extra major/minor metrics - confirm in get_scores.
    def score_mimic():
        return get_scores(pred_df=predictions, sol_df=reference_mimic, mm_vals=False)

    def score_A():
        return get_scores(pred_df=predictions, sol_df=reference_A, mm_vals=True)

    if score_type == 'mimic_scores':
        mimic_only = score_mimic()
        cmp_results('mimic_scores', mimic_only, expected)
        return mimic_only

    if score_type == 'A_scores':
        A_only = score_A()
        cmp_results('A_scores', A_only, expected)
        return A_only

    # Every other value means "both": score both groups first, then report both.
    mimic_result = score_mimic()
    A_result = score_A()
    cmp_results('mimic_scores', mimic_result, expected)
    cmp_results('A_scores', A_result, expected)
    return mimic_result, A_result

def save_gt(root_dir, mimic_scores, A_scores):
    """Write the two score groups to ``<root_dir>/gt.json``.

    Args:
        root_dir: Directory in which ``gt.json`` is created or overwritten.
        mimic_scores: Scores stored under the ``mimic_scores`` key.
        A_scores: Scores stored under the ``A_scores`` key.
    """
    target_path = os.path.join(root_dir, 'gt.json')
    payload = dict(mimic_scores=mimic_scores, A_scores=A_scores)

    with open(target_path, 'w') as out_handle:
        json.dump(payload, out_handle, indent=4)

In [13]:
# All predictions are correct.
root_dir = 'all_correct'
_,_ = validate(root_dir)


With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Full Scores Mimic hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 1.0, 'hybrid_precision': 1.0, 'f1_score': 1.0, 'accuracy': 1.0, 'prc_auc': 1.0, 'roc_auc': 1.0}
With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Evaluating performance on signal vs non-signal hybrids
Full Scores Species A hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 1.0, 'hybrid_precision': 1.0, 'f1_score': 1.0, 'accuracy': 1.0, 'prc_auc': 1.0, 'roc_auc': 1.0, 'major_recall': np.float64(1.0), 'minor_recall': np.float64(1.0), 'major_prc_auc': np.float64(1.0), 'minor_prc_auc': np.float64(1.0), 'major_roc_auc': np.float64(1.0), 'minor_roc_auc': np.float64(1.0)}
threshold_recall    matched.
threshol

In [None]:
# All predictions are wrong.
root_dir = 'all_wrong'
_,_ = validate(root_dir)

With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 1 are all non-hybrids and the ones higher are all hyrids.
Full Scores Mimic hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 1.0, 'hybrid_recall': 0.0, 'hybrid_precision': 0.0, 'f1_score': 0.0, 'accuracy': 0.5, 'prc_auc': 0.5, 'roc_auc': 0.0}
With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 1 are all non-hybrids and the ones higher are all hyrids.
Evaluating performance on signal vs non-signal hybrids
Full Scores Species A hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 1.0, 'hybrid_recall': 0.0, 'hybrid_precision': 0.0, 'f1_score': 0.0, 'accuracy': 0.5, 'prc_auc': 0.5, 'roc_auc': 0.0, 'major_recall': np.float64(0.0), 'minor_recall': np.float64(0.0), 'major_prc_auc': np.float64(0.5), 'minor_prc_auc': np.float64(0.5), 'major_roc_auc': np.float64(0.0), 'minor_roc_auc': np.float64(0.0)}
threshold_recall    matched.
threshol

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
# All predictions are zeros.
root_dir = 'all_zeros'
_,_ = validate(root_dir)

With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Full Scores Mimic hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 0.0, 'hybrid_precision': 0.0, 'f1_score': 0.0, 'accuracy': 0.5, 'prc_auc': 0.5, 'roc_auc': 0.5}
With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Evaluating performance on signal vs non-signal hybrids
Full Scores Species A hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 0.0, 'hybrid_precision': 0.0, 'f1_score': 0.0, 'accuracy': 0.5, 'prc_auc': 0.5, 'roc_auc': 0.5, 'major_recall': np.float64(0.0), 'minor_recall': np.float64(0.0), 'major_prc_auc': np.float64(0.5), 'minor_prc_auc': np.float64(0.5), 'major_roc_auc': np.float64(0.5), 'minor_roc_auc': np.float64(0.5)}
threshold_recall    matched.
threshol

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# A sample where all predictions are correct, but the expected values in gt.json are wrong.
root_dir = 'bad_gt'
_,_ = validate(root_dir)


With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Full Scores Mimic hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 1.0, 'hybrid_precision': 1.0, 'f1_score': 1.0, 'accuracy': 1.0, 'prc_auc': 1.0, 'roc_auc': 1.0}
With non-hybrid recall 1.0, the predictions equal and lower than the threshold confident score 0 are all non-hybrids and the ones higher are all hyrids.
Evaluating performance on signal vs non-signal hybrids
Full Scores Species A hybrid detection: {'threshold_recall': 1.0, 'threshold_pred': 0.0, 'hybrid_recall': 1.0, 'hybrid_precision': 1.0, 'f1_score': 1.0, 'accuracy': 1.0, 'prc_auc': 1.0, 'roc_auc': 1.0, 'major_recall': np.float64(1.0), 'minor_recall': np.float64(1.0), 'major_prc_auc': np.float64(1.0), 'minor_prc_auc': np.float64(1.0), 'major_roc_auc': np.float64(1.0), 'minor_roc_auc': np.float64(1.0)}
threshold_recall    should be 3.0, bu