# File to get AUROC for a model

In [1]:
import pandas as pd 
from evaluate_models import *
import os
import torch
from utils import make_auc_plot, make_precision_recall
from utils import config, plot_auc
from matplotlib import pyplot as plt
from utils import get_roc_CI

# Tell CUDA to enumerate GPUs in PCI bus order and expose only GPUs 0 and 1
# to this process.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID", "CUDA_VISIBLE_DEVICES": "0,1"})

# Use the first visible GPU when CUDA is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Helper to plot ROC curves, with bootstrap confidence bands, for one or more labels.
def plot_auc(y_true, y_pred, labels):
    """Plot one ROC curve per label and report each AUROC with a 95% interval.

    For every label the curve and the AUROC come from ``metrics.roc_curve``
    and ``metrics.roc_auc_score``. The shaded band and the interval quoted in
    the legend come from ``get_roc_CI`` (2.5th and 97.5th percentiles of the
    AUROC scores it returns). Each legend entry is also printed.

    NOTE(review): ``np`` and ``metrics`` are not imported explicitly in the
    visible cells -- presumably they arrive through
    ``from evaluate_models import *``; confirm.

    Args:
        y_true: Ground-truth values. With a single label it is used as given;
            with several labels it is squeezed and column ``i`` is used for
            ``labels[i]``.
        y_pred: Predicted scores, laid out like ``y_true``.
        labels: Class names, one per curve. Must not be empty.

    Returns:
        A tuple ``(auc, lower, upper)``. ``auc`` is a single score when there
        is one label and a list with one score per label otherwise. ``lower``
        and ``upper`` are the interval bounds of the last label only (kept
        that way for backward compatibility).

    Raises:
        ValueError: If ``labels`` is empty.
    """
    n_classes = len(labels)
    if n_classes == 0:
        # Previously this fell through to a NameError at the return statement.
        raise ValueError("plot_auc requires at least one label")

    colors = ['r', 'b', 'g', 'c', 'm']
    lw = 2

    if n_classes == 1:
        per_class = [(y_true, y_pred)]
    else:
        # Squeeze once so the curve, the score and the interval all see the
        # same columns.
        truth = np.squeeze(y_true)
        scores = np.squeeze(y_pred)
        per_class = [(truth[:, i], scores[:, i]) for i in range(n_classes)]

    auc = []
    for i, (label, (cls_true, cls_pred)) in enumerate(zip(labels, per_class)):
        # Reuse colours cyclically so classes beyond the fifth are still drawn.
        color = colors[i % len(colors)]

        fpr, tpr, _ = metrics.roc_curve(cls_true, cls_pred)
        cls_auc = metrics.roc_auc_score(cls_true, cls_pred)
        auc.append(cls_auc)

        _, auc_scores, mean_fpr, tprs_lower, tprs_upper = get_roc_CI(cls_true, cls_pred)
        ci_lower = np.percentile(auc_scores, 2.5)
        ci_upper = np.percentile(auc_scores, 97.5)

        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, alpha=.1, color=color)
        conf_int = ' ({:.2f}-{:.2f})'.format(ci_lower, ci_upper)
        test = 'ROC curve of ' + label + ' area = {:0.2f}'.format(cls_auc) + conf_int
        plt.plot(fpr, tpr, color=color, lw=lw, label=test)
        print(test)

    plt.axis('scaled')
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right', fontsize='small')
    # tight_layout only has an effect once the axes exist, so call it last.
    plt.tight_layout()
    plt.show()

    # A single label returns a bare score, several labels return the list.
    return (auc[0] if n_classes == 1 else auc), ci_lower, ci_upper
    

# Run test set through best models on all seeds + make plots accordingly 

In [None]:
# Evaluate the best model of every seed on the test split and plot its ROC curves.
#
# TODO: define model type and model name; together they form the config key
# "<model_type>.<model_name>" used below.
model_type = ""
model_name = ""

# TODO: define path to the search log CSV.
# Only the first row for each 'savename' is kept.
df_search = pd.read_csv("").drop_duplicates(subset = ['savename'], keep = "first")

# Only the first value returned by get_best_model_across_seeds is used.
best_model_info,_ = get_best_model_across_seeds(df_search)
# NOTE(review): `predictions` is never appended to in this cell, and
# `te_loader` is overwritten on every loop iteration.
predictions = []
te_loader = []
valid_scores = []   # one average validation score per entry of best_model_info
pt_ids_all = []     # unique ids returned with the test predictions, one array per entry

for model_info in best_model_info:
    valid_scores.append(model_info.average_score)
    config_str = model_type + "." + model_name 
    # The labels are stored in the config as one '|'-separated string.
    labels = config(config_str + ".labels").split("|")
    checkpoint, model, criterion, exp = load_best_model(model_info, device, config_str, model_type, model_name)

    # presumably a torch module: switch to evaluation mode before predicting
    model.eval()

    # Build the loaders for this entry's seed using the checkpoint's saved params.
    exp._get_data_loaders(model_info['seed'], checkpoint['params'], split = "test")
    # test loader is always the last loader
    te_loader = exp.loaders[-1]

    y_true, y_pred, pt_ids = get_test_predictions(model, criterion, device, te_loader, model_name = model_name, get_all_predictions = True)
    # y_pred / y_true come back as sequences of arrays; join each into one array.
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    pt_ids_all.append(np.unique(pt_ids))

    # NOTE(review): judging by the names, calc_roc reduces the predictions to
    # one entry per id -- confirm against its definition.
    score, unique_predictions, unique_truth, pt_ids = calc_roc(y_true, y_pred, pt_ids)
    # NOTE(review): same lookup as at the top of the loop; likely redundant.
    labels = config(config_str + ".labels").split("|")
    plot_auc(unique_truth, unique_predictions, labels)


# Statistical Significance Tests 

In [140]:
# One-sided bootstrap test of whether the first model's AUROC beats the second's, per label.
def statistical_significance(y_true_first, y_pred_first, y_true_second, y_pred_second, labels, alpha = 0.05):
    """Print, for each label, whether model one's AUROC significantly exceeds model two's.

    For every label the bootstrapped AUROC scores of both models are obtained
    from ``get_roc_CI`` and compared position by position. The reported
    p-value is ``1 - (fraction of positions where the first score is strictly
    greater)``, so the test is one-sided: it only asks whether the first model
    is better than the second.

    NOTE(review): the position-by-position comparison is only meaningful if
    ``get_roc_CI`` draws the same resamples for both calls -- confirm.

    Args:
        y_true_first: Ground truth for the first model. With one label it is
            used as given; with several it is squeezed and column ``i``
            belongs to ``labels[i]``.
        y_pred_first: Predictions of the first model, laid out like
            ``y_true_first``.
        y_true_second: Ground truth for the second model.
        y_pred_second: Predictions of the second model.
        labels: Class names; one result line is printed per label.
        alpha: Significance level. The result is called significant when the
            fraction of wins exceeds ``1 - alpha``.

    Raises:
        ValueError: If ``get_roc_CI`` yields no scores to compare for a label.
    """
    inputs = (y_true_first, y_pred_first, y_true_second, y_pred_second)
    if len(labels) == 1:
        per_label = [inputs]
    else:
        squeezed = [np.squeeze(arr) for arr in inputs]
        per_label = [tuple(arr[:, i] for arr in squeezed) for i in range(len(labels))]

    for label, (true_1, pred_1, true_2, pred_2) in zip(labels, per_label):
        # Bootstrapped AUROC scores for each model.
        _, auc_scores_first, _, _, _ = get_roc_CI(true_1, pred_1)
        _, auc_scores_second, _, _, _ = get_roc_CI(true_2, pred_2)

        wins = [first > second for first, second in zip(auc_scores_first, auc_scores_second)]
        if not wins:
            raise ValueError("no bootstrap AUROC scores to compare for label " + str(label))

        # Divide by the number of comparisons actually made instead of
        # assuming exactly 1000 bootstrap samples.
        total_greater = sum(wins)
        n_samples = len(wins)
        if total_greater / n_samples > 1 - alpha:
            print(label, "statistically significant at p-value", 1 - total_greater / n_samples)
        else:
            print(label, "not statistically significant at p-value", 1 - total_greater / n_samples)
         