# Out-of-Distribution (External) Evaluation

In [11]:

import os
from typing import *

import joblib
import numpy as np
import pandas as pd
from mimic_pipeline.metric import get_calibration_curve, get_model_size
from sklearn.metrics import (auc, brier_score_loss, precision_recall_curve,
                             roc_curve)


# Dataframe columns holding pre-computed severity-of-illness probabilities.
# When `model` is one of these, the column itself is scored; no trained model
# is loaded.
_SEVERITY_SCORE_COLUMNS = ('sapsii_prob', 'oasis_prob', 'apache_iv_prob', 'apache_iva_prob')


def ood_evaluate(eicu_df, model: str, exp: str, save: bool=False) -> Optional[dict]:
    """Score one model (or one severity-score column) on ``eicu_df`` and report metrics.

    Args:
        eicu_df: DataFrame with the label column ``hospital_expire_flag``.
            For trained models it must also contain ``uniquepid``,
            ``patientunitstayid`` and the four severity-score columns, all of
            which are dropped to form the feature matrix.
        model: Either a severity-score column name (``sapsii_prob``,
            ``oasis_prob``, ``apache_iv_prob``, ``apache_iva_prob``) or the
            name of a trained model whose artifacts live under
            ``models/{exp}/``.
        exp: Experiment name; selects the artifact directory and, for trained
            models, the ``results/{exp}`` output directory. ``'oasis+'``
            restricts the features to the OASIS subset.
        save: When true, dump the metrics with joblib and return ``None``;
            otherwise return the metrics dict.

    Returns:
        The metrics dict when ``save`` is false, else ``None``.
    """
    print(f"Evaluating OOD performance for {model}...")
    is_score = model in _SEVERITY_SCORE_COLUMNS
    y_test = eicu_df['hospital_expire_flag']

    if is_score:
        # The column already contains predicted probabilities, so no feature
        # matrix or model artifact is needed.
        trained_model = None
        y_prob = eicu_df[model]
    else:
        # NOTE(review): this path depends only on `exp`, while the imputer and
        # scaler paths below depend on `model`. Every model evaluated under
        # the same `exp` therefore loads the same file -- confirm intended.
        trained_model = joblib.load(f"models/{exp}/model.joblib")
        X_test = eicu_df.drop(
            ['uniquepid', 'patientunitstayid', 'hospital_expire_flag',
             'apache_iv_prob', 'apache_iva_prob', 'oasis_prob', 'sapsii_prob'],
            axis=1,
        )
        if exp == 'oasis+' or model == 'fasterrisk-oasis':
            oasis_features = [
                'heartrate_min', 'heartrate_max', 'meanbp_min', 'meanbp_max', 'resprate_min', 'resprate_max', 'tempc_min',
                'tempc_max', 'urineoutput', 'mechvent', 'electivesurgery', 'age', 'gcs_min', 'preiculos'
            ]
            X_test = X_test[oasis_features]

        if 'fasterrisk' in model or model in ('nonlinear-logreg-l1', 'nonlinear-logreg-l2'):
            # These models consume binarized features produced by a saved binarizer.
            binarizer = joblib.load(f"models/{exp}/binarizer.joblib")
            X_test, _ = binarizer.transform(X_test)
            y_prob = trained_model.predict_proba(X_test.to_numpy())
        else:
            imputer = joblib.load(f"models/{exp}/{model}-imputer")
            scaler = joblib.load(f"models/{exp}/{model}-scaler")
            columns = list(X_test.columns)
            # Re-wrap the imputed values in a DataFrame so the scaler sees the
            # original column names.
            X_test = pd.DataFrame(imputer.transform(X_test), columns=columns)
            X_test = scaler.transform(X_test)
            y_prob = trained_model.predict_proba(X_test)

    if y_prob.ndim == 2:        # for some scikit-learn models where probas is 2D
        y_prob = y_prob[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auroc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    auprc = auc(recall, precision)
    # The two calls differ only in `strategy`; their statistics are reported
    # below as the Hosmer-Lemeshow H and C statistics respectively.
    prob_true, prob_pred, h_stat, p1 = get_calibration_curve(y_test, y_prob)
    _, _, c_stat, p2 = get_calibration_curve(y_test, y_prob, strategy='quantile')
    brier = brier_score_loss(y_test, y_prob)
    # Observed positives over the sum of predicted probabilities. Labels equal
    # to -1 are mapped to 0 so they do not subtract from the observed count.
    smr = np.sum(y_test.replace({-1: 0})) / np.sum(y_prob)
    complexity = 'NA' if is_score else get_model_size(trained_model)

    print(f"AUROC: {auroc}\nAUPRC: {auprc}\nHosmer Lemeshow H stat: {h_stat}, P-Value: {p1}\nHosmer Lemeshow C stat: {c_stat}, P-Value: {p2}\nBrier Score: {brier}\nSMR: {smr}\nComplexity: {complexity}\n")

    stats = {
        "auroc": auroc, "auprc": auprc, "precision": precision, "recall": recall, "fpr": fpr, "tpr": tpr,
        "true_prob": prob_true, "pred_prob": prob_pred, "h-stat": h_stat, "h-p-value": p1, "c-stat": c_stat,
        "c-p-value": p2, "brier": brier, "smr": smr, 'complexity': complexity,
    }

    if not save:
        return stats

    # Severity scores are stored directly under results/; trained models go
    # under results/{exp}/. Create the directory in both cases, because
    # joblib.dump does not create missing parent directories.
    out_dir = "results" if is_score else f"results/{exp}"
    os.makedirs(out_dir, exist_ok=True)
    joblib.dump(stats, f"{out_dir}/{model}-ood-stats")
    return None
    

## FasterRisk

In [2]:
# Load the external evaluation table that every later cell passes to ood_evaluate.
eicu_csv_path = 'data/eICU-union-excluded-cmo.csv'
eicu_df = pd.read_csv(eicu_csv_path)

In [12]:
# Evaluate the three FasterRisk runs; the model name and the experiment
# directory share the same identifier. Metrics are saved rather than returned.
for gs in (10, 15, 40):
    run_name = f'fasterrisk-{gs}_o'
    ood_evaluate(eicu_df, model=run_name, exp=run_name, save=True)

Evaluating OOD performance for fasterrisk-10_o...
AUROC: 0.8376235045069208
AUPRC: 0.38820668933665486
Hosmer Lemeshow H stat: 5776.436364443756, P-Value: 0.0
Hosmer Lemeshow C stat: 5514.463281919735, P-Value: 0.0
Brier Score: 0.06644746812241369
SMR: 0.5221547739626612
Complexity: 42

Evaluating OOD performance for fasterrisk-15_o...
AUROC: 0.8526094577861507
AUPRC: 0.4272744541216901
Hosmer Lemeshow H stat: 4843.628741602765, P-Value: 0.0
Hosmer Lemeshow C stat: 4383.470928389969, P-Value: 0.0
Brier Score: 0.06273102922024416
SMR: 0.5642895468177759
Complexity: 52

Evaluating OOD performance for fasterrisk-40_o...
AUROC: 0.8595056851794944
AUPRC: 0.45390480223340596
Hosmer Lemeshow H stat: 5346.143290595267, P-Value: 0.0
Hosmer Lemeshow C stat: 4516.324534250578, P-Value: 0.0
Brier Score: 0.06186555166675559
SMR: 0.5756947286004948
Complexity: 82



## Union 49 For Other ML Models

In [None]:
# Evaluate each baseline ML model using the artifacts of the 'union49' experiment.
union49_models = ('ebm', 'nonlinear-logreg-l1', 'nonlinear-logreg-l2', 'random-forest', 'adaboost', 'xgboost')
for model in union49_models:
    ood_evaluate(eicu_df, model=model, exp='union49', save=True)

## OASIS+ Models

In [None]:
# Evaluate the same baseline ML models under the 'oasis+' experiment, which
# makes ood_evaluate restrict the inputs to the OASIS feature subset.
oasis_plus_models = ('ebm', 'nonlinear-logreg-l1', 'nonlinear-logreg-l2', 'random-forest', 'adaboost', 'xgboost')
for model in oasis_plus_models:
    ood_evaluate(eicu_df, model=model, exp='oasis+', save=True)

## Severity of Illness Scores

In [4]:
# Evaluate the pre-computed severity-of-illness probability columns. No model
# artifact is loaded for these names, and their metrics are saved as
# results/<model>-ood-stats regardless of `exp`.
severity_scores = ("oasis_prob", "sapsii_prob", "apache_iv_prob", "apache_iva_prob")
for model in severity_scores:
    ood_evaluate(eicu_df, model=model, exp='oasis+', save=True)

Evaluating OOD performance for oasis_prob...
AUROC: 0.7971265487226383
AUPRC: 0.31432076859159297
Hosmer Lemeshow H stat: 750.2734354294583, P-Value: 0.0
Hosmer Lemeshow C stat: 656.3111670626704, P-Value: 0.0
Brier Score: 0.06200544638978682
SMR: 0.796869823114373
Complexity: NA

Evaluating OOD performance for sapsii_prob...
AUROC: 0.8399486023254671
AUPRC: 0.39064969070711464
Hosmer Lemeshow H stat: 11438.152654219308, P-Value: 0.0
Hosmer Lemeshow C stat: 10233.383129090587, P-Value: 0.0
Brier Score: 0.07519916612991666
SMR: 0.44575669023662295
Complexity: NA

Evaluating OOD performance for apache_iv_prob...
AUROC: 0.8662356372662006
AUPRC: 0.44011186023324267
Hosmer Lemeshow H stat: 3206.194826806409, P-Value: 0.0
Hosmer Lemeshow C stat: 2811.593961878027, P-Value: 0.0
Brier Score: 0.058772796461723294
SMR: 0.6276517377584122
Complexity: NA

Evaluating OOD performance for apache_iva_prob...
AUROC: 0.8675396503722483
AUPRC: 0.44289852997433604
Hosmer Lemeshow H stat: 1555.02582633134