In [1]:
#Basic Tools
import os, sys, glob
import pandas as pd
import numpy as np
import random

#import other classifier tools
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

import warnings
# NOTE(review): this silences *every* warning for the rest of the session, including any
# RuntimeWarning (e.g. division by zero) or scikit-learn warning emitted while the metrics
# below are computed. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Input data and output locations
classification_dir = "./Final_ResampledClassificationData.csv"
SelectedFeat_dir = "./results/LASSO_SelectedFeatures.csv"
report_dir = "./results"

# Find all test-set prediction files.
# glob returns matches in a filesystem-dependent order, so sort the list to keep
# the processing order (and therefore any table built from it) reproducible.
filelist = sorted(glob.glob("./results/classification/*_predictions_test.csv"))

In [3]:
# Read the full classification table and the table of selected features
classify_df = pd.read_csv(classification_dir)
features = pd.read_csv(SelectedFeat_dir)

# Number of selected features (one entry per row of the "Feature" column)
num_ft = len(features["Feature"])

# Column layout of the working table: seven identifier/label columns first,
# followed by the selected features in the order they appear in the file.
ft = [
    "File",
    "unique_scandate",
    "unique_pt",
    "NE_Score",
    "NE_Status",
    "NE_Class",
    "Group",
    *features["Feature"].values.tolist(),
]
feat_data = classify_df[ft]

In [4]:
#Get Group DataFrames
train_df = feat_data[feat_data["Group"] == "train"]
val_df = feat_data[feat_data["Group"] == "val"]
test_df = feat_data[feat_data["Group"] == "test"]
# everything that is not in the test group (train + val together)
merge_train = feat_data[feat_data["Group"] != "test"]

# The selected features are the last num_ft columns of feat_data.
# Slice from an explicit start position: columns[-num_ft:] would silently
# return *every* column (identifiers and labels included) when num_ft == 0.
feature_cols = feat_data.columns[len(feat_data.columns) - num_ft:].tolist()

# Label column, addressed by name instead of by position
# (it is the column at index 5 of feat_data).
label_col = "NE_Class"

#Get Test Set variables
X_test = test_df[feature_cols]
Y_test = test_df[label_col]
Y_files = test_df["File"].tolist()
Y_scans = test_df["unique_scandate"].tolist()
Y_pts = test_df["unique_pt"].tolist()

#Get Train and Val variables
X_merge = merge_train[feature_cols]
Y_merge = merge_train[label_col]

In [5]:
# create a matrix of 1000 bootstrap samples selecting only one series from each unique_scandate
# we select this variable because our labels (NE status) are unique to each scan date (i.e. patients can have multiple labels across multiple scan dates)

all_scandates = np.unique(Y_scans)
print(len(all_scandates))

num_bootstrap_samples = 1000

# The set of series belonging to each scan date does not change between
# bootstrap iterations, so filter test_df once per scan date up front instead of
# once per (iteration, scan date) pair.
series_by_scandate = [
    test_df.loc[test_df['unique_scandate'] == _date, 'File'].tolist()
    for _date in all_scandates
]

# Seed the global NumPy RNG so the drawn samples are reproducible.
# The draws happen in the same order as before (sample by sample, scan date by
# scan date), so the resulting matrix is unchanged.
np.random.seed(822)
bootstrap_samples = [
    # for all scan dates, randomly select one series
    [np.random.choice(_series, size=1, replace=False).item() for _series in series_by_scandate]
    for _ in range(num_bootstrap_samples)
]


26


In [6]:
# Evaluate every prediction file on every bootstrap sample, then summarise each
# metric by its mean and its 2.5% / 97.5% quantiles over the bootstrap samples.

# Metric names, in the column order of the saved table
METRIC_NAMES = ["Accuracy", "PPV", "Sensitivity", "NPV", "Specificity", "FPR", "F1_Score", "AUC"]

# Column suffix -> NaN-aware reducer applied to the per-sample metric values
SUMMARY_STATS = {
    "mean": np.nanmean,
    "low95": lambda values: np.nanquantile(values, 0.025),
    "high95": lambda values: np.nanquantile(values, 0.975),
}


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def _binary_metrics(y_true, y_pred, y_score=None):
    """Compute the classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : list
        Ground-truth labels.
    y_pred : list
        Binarised predictions.
    y_score : list or None
        Continuous prediction scores used for ROC AUC; when None the AUC is
        reported as NaN.

    Returns
    -------
    dict
        One value per entry of METRIC_NAMES.
    """
    # A single confusion matrix provides all count-based metrics
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "PPV": precision_score(y_true, y_pred),
        "Sensitivity": recall_score(y_true, y_pred),
        "NPV": _safe_ratio(tn, tn + fn),
        "Specificity": _safe_ratio(tn, tn + fp),
        "FPR": _safe_ratio(fp, fp + tn),
        "F1_Score": f1_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_score) if y_score is not None else np.nan,
    }


# Output table: "model" first, then all *_mean, all *_low95 and all *_high95 columns
bootstrap_metrics_dict = {"model": []}
for _stat in SUMMARY_STATS:
    for _metric in METRIC_NAMES:
        bootstrap_metrics_dict[_metric + "_" + _stat] = []

for f in filelist:
    print(f)
    pred_df = pd.read_csv(f)

    # AUC is not computed (recorded as NaN) for files whose name contains "HardVote"
    use_scores = "HardVote" not in f

    metrics_dict = {_metric: [] for _metric in METRIC_NAMES}
    # Iterate over the samples themselves rather than a hard-coded range(0, 1000),
    # so this loop always matches the number of bootstrap samples that were drawn.
    for _sample in bootstrap_samples:
        boot_df = pred_df[pred_df['File'].isin(_sample)]
        # Local names only: the test-set labels held in Y_test must not be overwritten here
        y_true = boot_df['gt'].tolist()
        ypred = boot_df['pred_bin'].tolist()
        ypred_prob = boot_df['pred_contin'].tolist() if use_scores else None

        sample_metrics = _binary_metrics(y_true, ypred, ypred_prob)
        for _metric in METRIC_NAMES:
            metrics_dict[_metric].append(sample_metrics[_metric])

    # now calculate summary statistics from all bootstrap samples (mean and 95% CI)
    bootstrap_metrics_dict["model"].append(f)
    for _stat, _reduce in SUMMARY_STATS.items():
        for _metric in METRIC_NAMES:
            bootstrap_metrics_dict[_metric + "_" + _stat].append(_reduce(metrics_dict[_metric]))

b_df = pd.DataFrame(bootstrap_metrics_dict)
b_df.to_csv(os.path.join(report_dir, "bootstrap_test_all_models.csv"), index = False)

./results/classification\Unweighted_LogisticRegression_Fold1_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_Fold2_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_Fold3_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_Fold4_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_Fold5_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_HardVote_Ensemble_predictions_test.csv
./results/classification\Unweighted_LogisticRegression_SoftVote_Ensemble_predictions_test.csv
./results/classification\Unweighted_RandomForest_Fold1_predictions_test.csv
./results/classification\Unweighted_RandomForest_Fold2_predictions_test.csv
./results/classification\Unweighted_RandomForest_Fold3_predictions_test.csv
./results/classification\Unweighted_RandomForest_Fold4_predictions_test.csv
./results/classification\Unweighted_RandomForest_Fold5_predictions_test.csv
./results/classificati