Analysis of the models obtained previously

In [None]:
import sys
sys.path.append('../')
from utils import *

In [None]:
# Lab measurements: two index columns and a two-row header
labs = pd.read_csv('data/labs_1_day.csv', index_col = [0, 1], header = [0, 1])
# Outcomes: Death is turned into a boolean flag, True wherever the raw column is not missing
outcomes = (
    pd.read_csv('data/outcomes_1_day.csv', index_col = 0)
    .assign(Death = lambda raw: raw['Death'].notna())
)

In [None]:
group_name = 'ethnicity' # gender or ethnicity
# Binary split of the population: any value that does not match (missing included)
# falls in the second category
ethnicity = (outcomes.ETHNICITY == 'WHITE').replace({True: 'White', False: 'Non white'})
gender = (outcomes.GENDER == 'M').replace({True: 'Male', False: 'Female'})
available_groups = {'ethnicity': ethnicity, 'gender': gender}
if group_name not in available_groups:
    # Fail here rather than leaving `groups` undefined or left over from an earlier run
    raise ValueError("Unknown group_name {!r}: expected one of {}".format(group_name, sorted(available_groups)))
groups = available_groups[group_name]
groups_unique = groups.unique()

In [None]:
# Folder scanned below for the CSV files holding the models' predictions
results = 'results/'

# Open results

In [None]:
import os

In [None]:
# Display name of each model, keyed by the stem of its result file
names = {
    'classification_reg_' + suffix: label
    for suffix, label in (
        ('LOCF', 'LOCF'),
        ('Individual', 'Individual Median'),
        ('MICE', 'Population MICE'),
        ('gender_specific', 'Gender MICE'),
        ('ethnicity_specific', 'Ethnicity MICE'),
    )
}

In [None]:
# Predictions of every known model, keyed by its display name
predictions = {}

for file in sorted(os.listdir(results)):
    # Match on the extension: a substring test would also accept e.g. 'x.csv.bak'
    if not file.endswith('.csv'):
        continue
    name = file[:-len('.csv')]
    if name not in names:
        # CSV without a display name: skip it instead of failing on names[name]
        print(file, ' -> ', 'skipped (no entry in names)')
        continue
    predictions[names[name]] = pd.read_csv(os.path.join(results, file), index_col=0)
    print(file, ' -> ', name)

# Performances

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, brier_score_loss

Differences in observed labels between training and testing

### All metrics

Evaluate all metrics on datasets

In [None]:
def evaluate(y_true, groups, y_pred, iterations = 100):
    """
        Compute bootstrapped performances overall, per group and for the gap
        between two groups.

        Two decision thresholds are estimated once on all the samples: the one
        reaching 90% TPR and the one reaching 10% FPR. Each bootstrap resample is
        then scored at these fixed thresholds, along with its Brier score and ROC AUC.

        Arguments:
            y_true: Observed binary labels, one per sample.
            groups: Group membership, one per sample.
            y_pred: Predicted score for the positive label, one per sample.
            iterations: Number of bootstrap resamples drawn for each group.

        Returns:
            pd.Series indexed by (group, metric, 'Mean' or 'Std'). `group` takes each
            sorted unique value of `groups`, then "Overall", plus one entry
            'Difference <A> - <B>' where A and B are the first two names of that list.

        Resampling uses the global numpy random state: seed it before calling
        to obtain reproducible values.
    """
    # Plain arrays guarantee positional indexing when resampling
    # (a pandas Series would be indexed by label instead)
    y_true, groups, y_pred = np.asarray(y_true), np.asarray(groups), np.asarray(y_pred)

    metrics = ["Brier Score", "AUC ROC", "FPR @ 90% TPR", "TPR @ 10% FPR"]
    groups_unique = np.unique(groups).tolist() + ["Overall"]
    samples = {group: {metric: [] for metric in metrics} for group in groups_unique}

    # Operating points estimated on all the samples
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    # Stable sort: tied rates keep their original order, so every rate stays paired
    # with the threshold of a neighbouring point of the curve
    fpr_sort = np.argsort(fpr, kind = 'stable')
    tpr_sort = np.argsort(tpr, kind = 'stable')
    threshold_fpr = np.interp(0.9, tpr[tpr_sort], thresholds[tpr_sort]) # Threshold reaching 90% TPR
    threshold_tpr = np.interp(0.1, fpr[fpr_sort], thresholds[fpr_sort]) # Threshold reaching 10% FPR

    for group in groups_unique:
        if group == 'Overall':
            y_pred_group = y_pred
            y_true_group = y_true
        else:
            y_pred_group = y_pred[groups == group]
            y_true_group = y_true[groups == group]
        for _ in range(iterations):
            # Resample the group with replacement, keeping its size
            bootstrap = np.random.choice(np.arange(len(y_pred_group)), size = len(y_pred_group), replace = True)
            y_pred_iteration = y_pred_group[bootstrap]
            y_true_group_iter = y_true_group[bootstrap]

            # Rates of the resample read at the fixed thresholds (np.interp needs increasing x)
            fpr, tpr, thresholds = roc_curve(y_true_group_iter, y_pred_iteration)
            thres_order = np.argsort(thresholds)

            scores = samples[group]
            scores["Brier Score"].append(brier_score_loss(y_true_group_iter, y_pred_iteration))
            scores["AUC ROC"].append(roc_auc_score(y_true_group_iter, y_pred_iteration))
            scores["FPR @ 90% TPR"].append(np.interp(threshold_fpr, thresholds[thres_order], fpr[thres_order]))
            scores["TPR @ 10% FPR"].append(np.interp(threshold_tpr, thresholds[thres_order], tpr[thres_order]))

    # Gap between the first two names, paired by bootstrap iteration
    first, second = groups_unique[0], groups_unique[1]
    difference = 'Difference {} - {}'.format(first, second)
    result = {}
    for metric in metrics:
        gap = np.array(samples[first][metric]) - np.array(samples[second][metric])
        result[(difference, metric, 'Mean')] = np.mean(gap)
        result[(difference, metric, 'Std')] = np.std(gap)
    for group in groups_unique:
        for metric in metrics:
            result[(group, metric, 'Mean')] = np.mean(samples[group][metric])
            result[(group, metric, 'Std')] = np.std(samples[group][metric])
    return pd.Series(result)

In [None]:
# Compute and display performances per group of model
performances = {}
for m in predictions:
    print('-' * 42)
    print(m)

    # Reseed before each model so its bootstrap draws are reproducible
    np.random.seed(42)
    preds = predictions[m]

    # Keep every sample that was not used for training
    is_test = preds.Use != 'Train'
    test = is_test[is_test].index

    performances[m] = evaluate(outcomes.Death.loc[test].values, groups.loc[test].values, preds.loc[test, '1'].values)
# One row per model; `axis` is passed by keyword because pandas >= 2.0 rejects the positional form
performances = pd.concat(performances, axis = 1).T

In [None]:
# Table of bootstrapped metrics: one row per model, columns indexed by (group, metric, Mean / Std)
performances

In [None]:
# One horizontal bar chart per metric: groups on the y axis, one bar per model
perf = performances.loc[:, ~performances.columns.get_level_values(0).str.contains('Difference')] # Ignore difference
col = perf.columns.get_level_values(1)
# Models ordered by increasing overall AUC
order = performances['Overall']['AUC ROC']["Mean"].sort_values().index
for metric in col.unique():
    perf_metric = perf.loc[:, col == metric].droplevel(1, axis = 1)
    statistic = perf_metric.columns.get_level_values(1)
    perf_metric_mean = perf_metric.loc[:, statistic == "Mean"].droplevel(1, axis = 1)
    # 'Std' is the spread of the metric across bootstrap resamples, i.e. already its
    # standard error: the 95% half-width is 1.96 * Std. Dividing by sqrt(iterations)
    # would shrink the interval with the number of resamples instead of the data size.
    perf_metric_std = 1.96 * perf_metric.loc[:, statistic == "Std"].droplevel(1, axis = 1)

    perf_metric_mean = perf_metric_mean.loc[order]
    perf_metric_std = perf_metric_std.loc[order]
    ax = perf_metric_mean.T.plot.barh(yerr = perf_metric_std.T)

    plt.title('{}'.format(metric))
    plt.grid(alpha = 0.3)
    plt.xlabel('Performance')
    # Reverse the legend entries relative to the default order
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(reversed(handles), reversed(labels), loc='center left', bbox_to_anchor=(1, 0.5), )

    # Metric-specific zoom on the x axis
    if 'ROC' in metric:
        plt.xlim(0.6, 0.8)
    elif 'rier' in metric:
        plt.xlim(0.10, 0.14)
    else:
        plt.xlim(0., 1.1)

    plt.show()

In [None]:
# One horizontal bar chart per metric showing the mean between-group difference of each model
top_level = performances.columns.get_level_values(0)
is_difference = top_level.str.contains('Difference')
# The column is named 'Difference <A> - <B>' by evaluate: reuse it for the axis label so the
# displayed order always matches the sign of the plotted values (groups.unique() follows the
# order of appearance and may list the two groups the other way round)
difference_names = top_level[is_difference].unique()
group_pair = difference_names[0].replace('Difference ', '', 1) if len(difference_names) else ''

perf = performances.loc[:, is_difference].droplevel(0, axis = 1)
col = perf.columns.get_level_values(0)
# Models ordered by increasing overall AUC
order = performances['Overall']['AUC ROC']["Mean"].sort_values().index
for metric in col.unique():
    perf_metric = perf.loc[:, col == metric]
    perf_metric = perf_metric.loc[:, perf_metric.columns.get_level_values(1) == "Mean"].droplevel(1, axis = 1)
    perf_metric.loc[order].plot.barh(legend = False)

    plt.title('{} difference'.format(metric))
    plt.grid(alpha = 0.3)
    plt.xlabel('Performance {}'.format(group_pair))
    plt.xlim(-0.15, 0.15)