Computing the global statistics of the predictions when we train different models from a cross-validation approach.

In [47]:
# https://towardsdatascience.com/metrics-and-python-ii-2e49597964ff
# https://en.wikipedia.org/wiki/Sensitivity_and_specificity
# en el apartado "a worked example" de wikipedia viene muy bien explicado
import os
import json
import numpy as np

from imgclas.data_utils import load_image, load_class_names
from imgclas import paths, plot_utils
from sklearn.metrics import f1_score

from imgclas import test_utils
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings("ignore") # To ignore UndefinedMetricWarning: [Recall/Precision/F-Score] is ill-defined and being set to 0.0 in labels with no [true/predicted] samples.

# User parameters to set
timestamp = ['2022-04-19_Fold1SpAnd45Balanced_18ep_stop_16Batch', 
             '2022-04-19_Fold2SpAnd45Balanced_17ep_stop_16Batch',
            '2022-04-19_Fold3SpAn45Balanced_12ep_stop_16Batch', 
             '2022-04-20_Fold4SpAnd45Balanced_35ep_stop15_16Batch',
            '2022-04-20_Fold5SpAnd45Balanced_21ep_stop15_16Batch']      # timestamp of the model
SPLIT_NAME = 'test'                   # dataset split to predict
MODEL_NAME = 'final_model.h5'         # model to use to make the mediction
TOP_K = 2                             # number of top classes predictions to save

accs = []
sens = []
specs = []
ppv=[]
npv=[]
aucs=[]
accuracy=[]
f1_scores=[]
f1_scores_sklearn=[]
prevalences=[]
for TIMESTAMP in timestamp:

    # Set the timestamp
    paths.timestamp = TIMESTAMP

    # Load clas names
    class_names = load_class_names(splits_dir=paths.get_ts_splits_dir())

    # Load back the predictions
    pred_path = os.path.join(paths.get_predictions_dir(), '{}+{}+top{}.json'.format(MODEL_NAME, SPLIT_NAME, TOP_K))
    with open(pred_path) as f:
        pred_dict = json.load(f)
    
    # accuracy
    true_lab, pred_lab = np.array(pred_dict['true_lab']), np.array(pred_dict['pred_lab'])
    top1 = test_utils.topK_accuracy(true_lab, pred_lab, K=1)
    accs.append(top1)
    
    y_pred = np.array([item[0] for item in pred_lab])
    # standard confussion matrix
    TN, FP, FN, TP = confusion_matrix(true_lab, y_pred, labels=[0, 1]).ravel()
    Population = TN+FN+TP+FP
    sensitivity  = round( TP / (TP+FN),4 ) # recall
    specificity  = round( TN / (TN+FP),4 ) 
    pos_pred_val = round( TP / (TP+FP),4 ) # precision
    neg_pred_val = round( TN / (TN+FN),4 )
    Accuracy   = round( (TP+TN) / Population,4)
    Prevalence = round( (TP+FP) / Population,2)
    F1 = round ( 2 * ((pos_pred_val*sensitivity)/(pos_pred_val+sensitivity)),4)
    accuracy.append(Accuracy)
    ppv.append(pos_pred_val)
    npv.append(neg_pred_val)
    sens.append(sensitivity)
    specs.append(specificity)
    f1_scores.append(F1)
    prevalences.append(Prevalence)
    f1_scores_sklearn.append(f1_score(true_lab, pred_lab[:, 0]))
    
    scores=[]
    for i in range(0, len(pred_dict['pred_lab'])):
        if pred_dict['pred_lab'][i][0]==0:
            scores.append(pred_dict['pred_prob'][i][1])
        else:
            scores.append(pred_dict['pred_prob'][i][0])
    auc = roc_auc_score(true_lab, scores)
    aucs.append(auc)

Loading class names...
Loading class names...
Loading class names...
Loading class names...
Loading class names...


In [54]:
np.mean(prevalences)

0.492

In [38]:
mean_accs=np.mean(accs)
sd_accs=np.std(accs)
print('Mean accuracy is {:.2f} and its mean SD is {:.2f}'.format(mean_accs,sd_accs))

Mean accuracy is 0.87 and its mean SD is 0.06


In [39]:
mean_specs=np.mean(specs)
sd_specs=np.std(specs)
print('Mean specificity is {:.2f} and its mean SD is {:.2f}'.format(mean_specs,sd_specs))

Mean specificity is 0.88 and its mean SD is 0.10


In [51]:
mean_sens=np.mean(sens)
sd_sens=np.std(sens)
print('Mean sensitivity is {:.2f} and its mean SD is {:.2f}'.format(mean_sens,sd_sens))

Mean sensitivity is 0.87 and its mean SD is 0.10


In [41]:
mean_ppv=np.mean(ppv)
sd_ppv=np.std(ppv)
print('Mean PPV is {:.2f} and its mean SD is {:.2f}'.format(mean_ppv,sd_ppv))

Mean PPV is 0.89 and its mean SD is 0.08


In [42]:
mean_npv=np.mean(npv)
sd_npv=np.std(npv)
print('Mean NPV is {:.2f} and its mean SD is {:.2f}'.format(mean_npv,sd_npv))

Mean NPV is 0.88 and its mean SD is 0.09


In [43]:
mean_auc=np.mean(aucs)
sd_auc=np.std(aucs)
print('Mean AUC is {:.2f} and its mean SD is {:.2f}'.format(mean_auc,sd_auc))

Mean AUC is 0.96 and its mean SD is 0.03


In [44]:
mean_f1_scores=np.mean(f1_scores)
sd_f1_scores=np.std(f1_scores)
print('Mean F1 score is {:.2f} and its mean SD is {:.2f}'.format(mean_f1_scores,
                                                            sd_f1_scores))

Mean AUC is 0.87 and its mean SD is 0.07


In [49]:
mean_prevalences=np.mean(prevalences)
sd_prevalences=np.std(prevalences)
print('Mean Prevalence is {:.2f} and its mean SD is {:.2f}'.format(mean_prevalences,
                                                            sd_prevalences))

Mean Prevalence is 0.49 and its mean SD is 0.08
