# Evaluation Toolkit

In [72]:
import numpy as np
import pandas as pd
# import seaborn as sns
#import matplotlib.pyplot as plt
import os

from metrics.f1_score_f1_pa import *
from metrics.fc_score import *
from metrics.precision_at_k import *
from metrics.customizable_f1_score import *
from metrics.AUC import *
from metrics.Matthews_correlation_coefficient import *
from metrics.affiliation.generics import convert_vector_to_events
from metrics.affiliation.metrics import pr_from_events
from metrics.vus.models.feature import Window
from metrics.vus.metrics import get_range_vus_roc

from pate.PATE_metric import PATE

from sklearn import metrics
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve

In [73]:
def combine_all_evaluation_scores(y_test, pred_labels, anomaly_scores):
    events_pred = convert_vector_to_events(pred_labels) # [(4, 5), (8, 9)]
    events_gt = convert_vector_to_events(y_test)     # [(3, 4), (7, 10)]
    Trange = (0, len(y_test))
    affiliation = pr_from_events(events_pred, events_gt, Trange)
    true_events = get_events(y_test)
    # _, _, _, f1_score_ori, f05_score_ori = get_accuracy_precision_recall_fscore(y_test, pred_labels)
    # f1_score_pa = get_point_adjust_scores(y_test, pred_labels, true_events)[5]
    # pa_accuracy, pa_precision, pa_recall, pa_f_score = get_adjust_F1PA(y_test, pred_labels)
    # range_f_score = customizable_f1_score(y_test, pred_labels)
    # _, _, f1_score_c = get_composite_fscore_raw(y_test, pred_labels,  true_events, return_prec_rec=True)
    # precision_k = precision_at_k(y_test, anomaly_scores, pred_labels)
    # point_auc = point_wise_AUC(pred_labels, y_test)
    # range_auc = Range_AUC(pred_labels, y_test)
    # MCC_score = MCC(y_test, pred_labels)
    # results = get_range_vus_roc(y_test, pred_labels, 200) # slidingWindow = 100 default

    
    score_list = {
                #   "f1_score_ori": f1_score_ori, 
                #   "f05_score_ori" : f05_score_ori, 
                #   "f1_score_pa": f1_score_pa,
                #   "pa_accuracy":pa_accuracy, 
                #   "pa_precision":pa_precision, 
                #   "pa_recall":pa_recall, 
                #   "pa_f_score":pa_f_score,
                #   "range_f_score": range_f_score,
                  # "f1_score_c": f1_score_c, 
                #   "precision_k": precision_k,
                #   "point_auc": point_auc,
                #   "range_auc": range_auc, 
                #   "MCC_score":MCC_score, 
                  "Affiliation precision": affiliation['precision'], 
                  "Affiliation recall": affiliation['recall'],
                  # "R_AUC_ROC": results["R_AUC_ROC"], 
                  # "R_AUC_PR": results["R_AUC_PR"],
                  # "VUS_ROC": results["VUS_ROC"], 
                  # "VUS_PR": results["VUS_PR"]
                  }
    
    return score_list

In [74]:
def adjust_predicts(label, predict=None, calc_latency=False):
    
    label = np.asarray(label)
    latency = 0
    
    actual = label > 0.1
    anomaly_state = False
    anomaly_count = 0
    for i in range(len(actual)):
        if actual[i] and predict[i] and not anomaly_state:
                anomaly_state = True
                anomaly_count += 1
                for j in range(i, 0, -1):
                    if not actual[j]:
                        break
                    else:
                        if not predict[j]:
                            predict[j] = True
                            latency += 1
        elif not actual[i]:
            anomaly_state = False
        if anomaly_state:
            predict[i] = True
        
    MCM = metrics.multilabel_confusion_matrix(actual, predict, labels = [1, 0])

    pa_tn = MCM[0][0, 0]
    pa_tp = MCM[0][1, 1]
    pa_fp = MCM[0][0, 1]
    pa_fn = MCM[0][1, 0]
        
    # pa_tn, pa_fp, pa_fn, pa_tp = confusion_matrix(actual, predict).ravel()
    prec = pa_tp / (pa_tp + pa_fp)
    rec = pa_tp / (pa_tp + pa_fn)
    if (prec+rec) == 0:
        f1_score = 0
    else:
        f1_score = 2 * (prec * rec) / (prec + rec)
    
    if calc_latency:
        return predict, latency / (anomaly_count + 1e-4), pa_tp, pa_tn, pa_fp, pa_fn, prec , rec, f1_score
    else:
        return predict, prec, rec, f1_score, pa_tp, pa_tn, pa_fp, pa_fn

In [75]:
def add_summary_statistics(res_df):
    # Compute the sum of 'best_tp', 'best_tn', 'best_fp', 'best_fn'
    sum_best_tp = res_df['best_tp'].sum()
    sum_best_tn = res_df['best_tn'].sum()
    sum_best_fp = res_df['best_fp'].sum()
    sum_best_fn = res_df['best_fn'].sum()

    # Calculate precision, recall and f1 score
    precision = sum_best_tp / (sum_best_tp + sum_best_fp) if (sum_best_tp + sum_best_fp) > 0 else 0
    recall = sum_best_tp / (sum_best_tp + sum_best_fn) if (sum_best_tp + sum_best_fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Calculate the average and std of 'roc' and 'pr'
    roc_avg = res_df['roc'].mean()
    roc_std = res_df['roc'].std()
    pr_avg = res_df['pr'].mean()
    pr_std = res_df['pr'].std()


    sum_pa_tp = res_df['pa_tp'].sum()
    sum_pa_tn = res_df['pa_tn'].sum()
    sum_pa_fp = res_df['pa_fp'].sum()
    sum_pa_fn = res_df['pa_fn'].sum()

    # Calculate precision, recall and f1 score
    pa_precision = sum_pa_tp / (sum_pa_tp + sum_pa_fp) if (sum_pa_tp + sum_pa_fp) > 0 else 0
    pa_recall = sum_pa_tp / (sum_pa_tp + sum_pa_fn) if (sum_pa_tp + sum_pa_fn) > 0 else 0
    pa_f1_score = 2 * (pa_precision * pa_recall) / (pa_precision + pa_recall) if (pa_precision + pa_recall) > 0 else 0



    aff_f1_mean = res_df['aff_f1'].mean()
    aff_f1_std = res_df['aff_f1'].std()

    R_AUC_ROC_mean = res_df['R_AUC_ROC'].mean()
    R_AUC_ROC_std = res_df['R_AUC_ROC'].std()

    R_AUC_PR_mean = res_df['R_AUC_PR'].mean()
    R_AUC_PR_std = res_df['R_AUC_PR'].std()

    VUS_ROC_mean = res_df['VUS_ROC'].mean()
    VUS_ROC_std = res_df['VUS_ROC'].std()

    VUS_PR_mean = res_df['VUS_PR'].mean()
    VUS_PR_std = res_df['VUS_PR'].std()

    PATE_mean = res_df['PATE'].mean()
    PATE_std = res_df['PATE'].std()

    PATE_F1_mean = res_df['PATE_F1'].mean()
    PATE_F1_std = res_df['PATE_F1'].std()

    summary_row = pd.Series({
        'best_pre': precision,
        'best_rec': recall,
        'b_f_1': f1_score,
        'roc': roc_avg,
        'roc_std': roc_std,
        'pr': pr_avg,
        'pr_std': pr_std,
        # 'pa_pre': pa_precision,
        # 'pa_rec': pa_recall,
        # 'pa_f1': pa_f1_score,
        'aff_f1_mean': aff_f1_mean,
        'aff_f1_std': aff_f1_std,
        # 'R_AUC_ROC_mean': R_AUC_ROC_mean,
        # 'R_AUC_ROC_std': R_AUC_ROC_std,
        # 'R_AUC_PR_mean': R_AUC_PR_mean,
        # 'R_AUC_PR_std': R_AUC_PR_std,
        # 'VUS_ROC_mean': VUS_ROC_mean,
        # 'VUS_ROC_std': VUS_ROC_std,
        # 'VUS_PR_mean': VUS_PR_mean,
        # 'VUS_PR_std': VUS_PR_std,
        'PATE_mean': PATE_mean,
        'PATE_std': PATE_std,
        'PATE_F1_mean': PATE_F1_mean,
        'PATE_F1_std': PATE_F1_std

    })
    # Append the results to the dataframe
    # summary_row = pd.Series({
    #     'best_tp': sum_best_tp,
    #     'best_tn': sum_best_tn,
    #     'best_fp': sum_best_fp,
    #     'best_fn': sum_best_fn,
    #     'best_pre': precision,
    #     'best_rec': recall,
    #     'b_f_1': f1_score,
    #     'roc': roc_avg,
    #     'roc_std': roc_std,
    #     'pr': pr_avg,
    #     'pr_std': pr_std,
    #     'pa_tp': sum_pa_tp,
    #     'pa_tn': sum_pa_tn,
    #     'pa_fp': sum_pa_fp,
    #     'pa_fn': sum_pa_fn,
    #     'pa_pre': pa_precision,
    #     'pa_rec': pa_recall,
    #     'pa_f1': pa_f1_score,
    #     'aff_f1_mean': aff_f1_mean,
    #     'aff_f1_std': aff_f1_std,
    #     'R_AUC_ROC_mean': R_AUC_ROC_mean,
    #     'R_AUC_ROC_std': R_AUC_ROC_std,
    #     'R_AUC_PR_mean': R_AUC_PR_mean,
    #     'R_AUC_PR_std': R_AUC_PR_std,
    #     'VUS_ROC_mean': VUS_ROC_mean,
    #     'VUS_ROC_std': VUS_ROC_std,
    #     'VUS_PR_mean': VUS_PR_mean,
    #     'VUS_PR_std': VUS_PR_std

    # })

    
    return summary_row

In [76]:
res_df = pd.DataFrame(columns=['name', 'roc', 'pr', 'best_tp', 'best_tn', 'best_fp', 'best_fn', 'best_pre', 'best_rec', 
                                'b_f_1', 'pa_tp', 'pa_tn', 'pa_fp', 'pa_fn', 'pa_pre', 'pa_rec', 'pa_f1', 'latency', 'aff_pre', 'aff_rec', 'aff_f1', 'R_AUC_ROC', 'R_AUC_PR', 'VUS_ROC', 'VUS_PR', 'PATE', 'PATE_F1']) 


ds_name = 'smd'  #'MSL'
path = os.path.join('results/', ds_name)
file_list = os.listdir(path)
file_list = sorted(file_list)


for filename in file_list: #data_info['chan_id']: #files: #data_info['chan_id']: #['uni-ts-all.csv']:
    if filename!='GECCO' and filename!='.json': #'Power' in filename : # and ('real_' in filename) : #'train' in filename:
        print(filename)
        df_train = pd.read_csv(f"results/{ds_name}/{filename}/classification/classification_trainprobs.csv")
        df_test = pd.read_csv(f"results/{ds_name}/{filename}/classification/classification_testprobs.csv")
        cl_num = df_train.shape[1] - 1

        df_train['Class'] = np.where((df_train['Class'] == 0), 0, 1)
        df_train['pred']=df_train[df_train.columns[0:cl_num]].idxmax(axis=1)

        score_col = df_train['pred'].value_counts().idxmax()
        
        df_test['Class'] = np.where((df_test['Class'] == 0), 0, 1)
        df_test['pred'] = df_test[df_test.columns[0:cl_num]].idxmax(axis=1)
#         print(df_test['pred'].value_counts())
        
#         score_col = df_test['pred'].value_counts().idxmax()
        #print('score col: ', score_col)
        
        roc_auc, pr_auc, best_tn, best_tp, best_fp, best_fn, best_pre, best_rec, best_f1 = 0, 0, 0, 0, 0, 0, 0, 0, 0
        try:

            df_test['pred'] = np.where((df_test['pred'] == score_col), 0, 1)

            MCM = metrics.multilabel_confusion_matrix(df_test['Class'], df_test['pred'], labels = [1, 0])

            tn = MCM[0][0, 0]
            tp = MCM[0][1, 1]
            fp = MCM[0][0, 1]
            fn = MCM[0][1, 0]

            if (tp+fp) == 0:
                pre = 0
            else:
                pre=tp/(tp+fp)
            if (tp+fn) == 0:
                recall = 0
            else:
                recall = tp/(tp+fn)
            if (pre + recall) == 0:
                f_1 = 0
            else:
                f_1 = 2*pre*recall/(pre+recall)
            # print('f-1 : ', f_1)

            scores = 1-df_test[score_col]

            roc_auc = roc_auc_score(df_test['Class'], scores)
            pr_auc = average_precision_score(df_test['Class'], scores)

            fpr, tpr, thresholds = roc_curve(df_test['Class'], scores, pos_label=1)
            precision, recall, thresholds = precision_recall_curve(df_test['Class'], scores, pos_label=1)

            res = pd.DataFrame()
            res['pre'] = precision
            res['rec'] = recall
            res['f1'] = 2*res['pre']*res['rec'] / (res['pre']+res['rec'])
            best_idx = res['f1'].argmax()
            best_f1 = res['f1'][best_idx]
            best_pre = res['pre'][best_idx]
            best_rec = res['rec'][best_idx]
            best_thr = thresholds[best_idx]
            print('Best f1 : ', best_f1, 'best_thr', best_thr)
            anomalies = [True if s >= best_thr else False for s in scores]

            best_tn, best_fp, best_fn, best_tp = confusion_matrix(df_test['Class'], anomalies).ravel()

            # print(type(df_test['Class'].values), type(np.array(anomalies)), type(scores.values))

            all_evals = combine_all_evaluation_scores(df_test['Class'].values, np.array(anomalies), scores.values)
            
            aff_pre = all_evals['Affiliation precision']
            aff_rec = all_evals['Affiliation recall']
            aff_f1 = 2 * (aff_pre * aff_rec) / (aff_pre + aff_rec) if (aff_pre + aff_rec) > 0 else 0
            # print(all_evals)

            R_AUC_ROC = 0 #all_evals['R_AUC_ROC']
            R_AUC_PR = 0 #all_evals['R_AUC_PR']
            VUS_ROC = 0 #all_evals['VUS_ROC']
            VUS_PR = 0 #all_evals['VUS_PR']


            pate = PATE(df_test['Class'].values, scores.values, binary_scores = False, n_jobs=-1)
            pate_f1 = PATE(df_test['Class'].values, np.array(anomalies), binary_scores = True, n_jobs=-1)
            print('PATE: ', pate, '\t PATE_F1: ', pate_f1)

        except ValueError:
            print('ERROR ++++++++++++++')
            pass

        
        pa_f1 = -1
        for thr in thresholds:
            preds_pa = [True if s >= thr else False for s in scores]
            pa_prediction, t_latency, t_tp, t_tn, t_fp, t_fn, t_pre, t_rec, t_f1 = adjust_predicts(df_test['Class'], preds_pa, True)
            if t_f1 > pa_f1:
                latency, pa_tp, pa_tn, pa_fp, pa_fn, pa_pre, pa_rec, pa_f1 = t_latency, t_tp, t_tn, t_fp, t_fn, t_pre, t_rec, t_f1
        
        new_row = pd.Series([filename, roc_auc, pr_auc, best_tp, best_tn, best_fp, best_fn, best_pre, best_rec, best_f1,
                                pa_tp, pa_tn, pa_fp, pa_fn, pa_pre, pa_rec, pa_f1, latency, aff_pre, aff_rec, aff_f1, R_AUC_ROC, R_AUC_PR, VUS_ROC, VUS_PR, pate, pate_f1],
                                index=['name', 'roc', 'pr', 'best_tp', 'best_tn', 'best_fp', 'best_fn', 'best_pre', 'best_rec', 
                                'b_f_1', 'pa_tp', 'pa_tn', 'pa_fp', 'pa_fn', 'pa_pre', 'pa_rec', 'pa_f1', 'latency', 'aff_pre', 'aff_rec', 'aff_f1', 'R_AUC_ROC', 'R_AUC_PR', 'VUS_ROC', 'VUS_PR', 'PATE', 'PATE_F1'])
        
        
        res_df = res_df._append(new_row, ignore_index=True) 
                
final_results = add_summary_statistics(res_df)
res_df.to_csv(path+'/'+f'{ds_name}_cuda_allmetrics.csv')

machine-1-1.txt
Best f1 :  0.2632063882063882 best_thr 0.6299153566360474
PATE:  0.11019463894050766 	 PATE_F1:  0.32004696919231834
machine-1-2.txt
Best f1 :  0.298804780876494 best_thr 0.6317599415779114
PATE:  0.28734535372957065 	 PATE_F1:  0.33005168307008603
machine-1-3.txt
Best f1 :  0.3443223443223443 best_thr 0.6337321102619171
PATE:  0.4244869306974516 	 PATE_F1:  0.41217176863187116
machine-1-4.txt
Best f1 :  0.23403456048084145 best_thr 0.6230305433273315
PATE:  0.6285280487629301 	 PATE_F1:  0.3370193625799214
machine-1-5.txt
Best f1 :  0.5995525727069352 best_thr 0.6344813108444214
PATE:  0.5909349848907846 	 PATE_F1:  0.6123066022224298
machine-1-6.txt
Best f1 :  0.5872768244063095 best_thr 0.6300226151943207
PATE:  0.5640559715083288 	 PATE_F1:  0.6943359284929577
machine-1-7.txt
Best f1 :  0.3714026052711299 best_thr 0.6313755214214325
PATE:  0.38970151717713447 	 PATE_F1:  0.44641031539246256
machine-1-8.txt
Best f1 :  0.43360695316272335 best_thr 0.6269603073596954
P

NotADirectoryError: [Errno 20] Not a directory: 'results/smd/smd_cuda_allmetrics.csv/classification/classification_trainprobs.csv'

In [77]:
res_df

Unnamed: 0,name,roc,pr,best_tp,best_tn,best_fp,best_fn,best_pre,best_rec,b_f_1,...,latency,aff_pre,aff_rec,aff_f1,R_AUC_ROC,R_AUC_PR,VUS_ROC,VUS_PR,PATE,PATE_F1
0,machine-1-1.txt,0.354332,0.120254,857,0,4798,0,0.151547,1.0,0.263206,...,0.124998,0.579039,1.0,0.733407,0,0,0,0,0.110195,0.320047
1,machine-1-2.txt,0.559463,0.234482,150,3844,363,341,0.292398,0.305499,0.298805,...,19.333119,0.685693,0.985504,0.808705,0,0,0,0,0.287345,0.330052
2,machine-1-3.txt,0.723843,0.351909,423,2666,1395,216,0.232673,0.661972,0.344322,...,19.499675,0.641788,0.911453,0.753212,0,0,0,0,0.424487,0.412172
3,machine-1-4.txt,0.532905,0.189614,623,0,4078,0,0.132525,1.0,0.234035,...,4.499775,0.53027,1.0,0.693041,0,0,0,0,0.628528,0.337019
4,machine-1-5.txt,0.812249,0.539158,134,4388,13,166,0.911565,0.446667,0.599553,...,2.249944,0.908163,0.663656,0.766893,0,0,0,0,0.590935,0.612307
5,machine-1-6.txt,0.608312,0.458777,1694,622,2189,192,0.436261,0.898197,0.587277,...,9.380908,0.624268,0.979533,0.762552,0,0,0,0,0.564056,0.694336
6,machine-1-7.txt,0.573761,0.314511,613,2011,1729,346,0.261742,0.639208,0.371403,...,6.333228,0.550834,0.940098,0.69465,0,0,0,0,0.389702,0.44641
7,machine-1-8.txt,0.660037,0.317157,449,3077,709,464,0.387737,0.491785,0.433607,...,4.133306,0.757026,0.835293,0.794236,0,0,0,0,0.439972,0.514067
8,machine-2-1.txt,0.58828,0.334617,243,3343,603,509,0.287234,0.323138,0.30413,...,2.666622,0.603477,0.824225,0.696786,0,0,0,0,0.43107,0.382867
9,machine-2-2.txt,0.689107,0.442504,540,2868,841,450,0.391021,0.545455,0.455504,...,15.555383,0.782479,0.988526,0.873516,0,0,0,0,0.502606,0.496429


In [78]:
add_summary_statistics(res_df)

best_pre        0.302855
best_rec        0.605069
b_f_1           0.403664
roc             0.646282
roc_std         0.123066
pr              0.370196
pr_std          0.176363
aff_f1_mean     0.756638
aff_f1_std      0.131098
PATE_mean       0.481722
PATE_std        0.180241
PATE_F1_mean    0.489427
PATE_F1_std     0.166605
dtype: float64