# Fairness Evaluations on eICU

In [None]:
import getpass

import numpy as np
import pandas as pd

import mimic_pipeline.utils as utils

# Prompt for the database credentials. The password is read with getpass so it
# is not echoed into the notebook output (and therefore not saved with the cell).
user = input("Enter your username: ")
password = getpass.getpass("Enter your password: ")
# Connect to the 'eicu' database/schema and pull the `patient` table by name.
loader = utils.DataBaseLoader(user=user, password=password, dbname='eicu', schema='eicu')
patient_df = loader['patient']
patient_df.head()

Use the portion of the eICU dataset that was not chosen for calibration.

In [None]:
# Load the evaluation cohort from CSV, then show its shape and first rows as a sanity check.
eicu_df = pd.read_csv("data/eICU-union-noncali.csv")
print(eicu_df.shape)
eicu_df.head()

In [None]:
# Left-join gender and ethnicity from the patient table onto the cohort,
# matching rows on both the unit-stay id and the unique patient id.
eicu_df = pd.merge(
    eicu_df,
    patient_df.loc[:, ['gender', 'ethnicity', 'patientunitstayid', 'uniquepid']],
    how='left',
    on=['patientunitstayid', 'uniquepid'],
)
# A left join keeps every cohort row; the row count only grows if the patient
# table holds duplicate key pairs, so compare with the shape printed on load.
print(eicu_df.shape)
eicu_df.head()

In [None]:
# Show the distinct ethnicity labels; the subgroup names hard-coded in the
# evaluation functions below are compared against these strings.
eicu_df['ethnicity'].unique()

In [None]:
# Show the distinct gender labels; the subgroup names hard-coded in the
# evaluation functions below are compared against these strings.
eicu_df['gender'].unique()

In [None]:
# Count the rows whose gender is recorded as 'Unknown' or 'Other'
# (summing a boolean mask counts its True entries).
print(f"Unknown Gender: {(eicu_df['gender'] == 'Unknown').sum()}")
print(f"Other Gender: {(eicu_df['gender'] == 'Other').sum()}")

In [None]:
import joblib
from mimic_pipeline.metric import get_calibration_curve, compute_all_metrics
from sklearn.metrics import roc_curve, precision_recall_curve, auc, brier_score_loss

def metrics_across_groups(eicu_df, method, group_sparsity, exp, type, calibrate=False, rounded=False):
    """Print mortality-prediction metrics for each demographic subgroup.

    For every subgroup of ``eicu_df`` selected through the ``type`` column,
    obtain predicted probabilities for ``method`` and print the AUROC, AUPRC,
    Brier score, Hosmer-Lemeshow H and C statistics (with p-values) and SMR
    reported by ``compute_all_metrics``.

    Args:
        eicu_df: DataFrame containing the ``type`` column, the label column
            ``hospital_expire_flag`` and, depending on ``method``, either the
            pre-computed score column or the model features. For model-based
            methods every column other than the identifiers, label, score
            columns, ``ethnicity`` and ``gender`` is passed on as a feature.
        method: ``'fasterrisk'``; one of ``'OASIS'``, ``'SAPS II'``,
            ``'APACHE IV'``, ``'APACHE IVa'`` (read from the matching
            ``*_prob`` column); or any other string, which is taken as the
            file name of a model stored under ``models/{exp}/``.
        group_sparsity: Selects the FasterRisk artifacts
            ``models/fasterrisk/fasterrisk-{group_sparsity}*``. Only read when
            ``method == 'fasterrisk'``.
        exp: Sub-directory of ``models/`` holding the model, scaler and
            imputer. Only read for methods that are neither FasterRisk nor one
            of the four severity scores.
        type: ``'ethnicity'`` or ``'gender'``. The name shadows the builtin but
            is kept because callers pass it by keyword.
        calibrate: For the four severity scores, pass the score through the
            saved ``models/{score column}-calibrator`` before evaluation.
        rounded: Print metrics with fixed precision instead of full precision.

    Raises:
        ValueError: If ``type`` is not a supported attribute, if
            ``group_sparsity`` is ``None`` for FasterRisk, or if ``exp`` is
            ``None`` for a model stored under ``models/{exp}/``.
    """
    if type not in ('ethnicity', 'gender'):
        raise ValueError(f"type must be 'ethnicity' or 'gender', got {type!r}")
    if type == 'ethnicity':
        groups = ['Caucasian', 'African American', 'Asian', 'Hispanic', 'Other/Unknown', 'Native American']
    else:
        groups = ['Female', 'Male']         # exclude other and unknown since they are very few

    score_columns = {
        'OASIS': 'oasis_prob',
        'SAPS II': 'sapsii_prob',
        'APACHE IV': 'apache_iv_prob',
        'APACHE IVa': 'apache_iva_prob',
    }
    # Identifier, label, score and demographic columns that are not model features.
    non_feature_columns = [
        'uniquepid', 'patientunitstayid', 'hospital_expire_flag',
        'apache_iv_prob', 'apache_iva_prob', 'oasis_prob', 'sapsii_prob',
        'ethnicity', 'gender',
    ]

    # Load every artifact once up front; none of them depends on the subgroup.
    if method == 'fasterrisk':
        if group_sparsity is None:
            raise ValueError("group_sparsity is required when method == 'fasterrisk'")
        fasterrisk = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}")
        binarizer = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}-binarizer")
        calibrator = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}-calibrator")
    elif method in score_columns:
        score_name = score_columns[method]
        calibrator = joblib.load(f"models/{score_name}-calibrator") if calibrate else None
    else:
        if exp is None:
            raise ValueError(f"exp is required to locate the saved model for method {method!r}")
        model = joblib.load(f"models/{exp}/{method}")
        scaler = joblib.load(f"models/{exp}/{method}-scaler")
        imputer = joblib.load(f"models/{exp}/{method}-imputer")

    total = len(eicu_df)
    for group in groups:
        subgroup_df = eicu_df[eicu_df[type] == group]
        share = len(subgroup_df) / total if total else 0.0
        print(f"There are {len(subgroup_df)} patients in {group} group ({share:.2%})")
        if subgroup_df.empty:
            # There is nothing to evaluate for a group without patients.
            print(f"Skipping {group}: no patients in this group")
            continue

        y_test = subgroup_df['hospital_expire_flag']

        if method in score_columns:
            # Severity scores are already stored as probabilities in the frame.
            y_prob = subgroup_df[score_name]
            if calibrator is not None:
                y_prob = calibrator.transform(y_prob)
        else:
            X_test = subgroup_df.drop(non_feature_columns, axis=1)
            if method == 'fasterrisk':
                X_test, _ = binarizer.transform(X_test)
                # Pass a plain array, as plot_metrics_across_groups does.
                y_prob = calibrator.transform(fasterrisk.predict_proba(X_test.to_numpy()))
            else:
                columns = list(X_test.columns)
                X_test = imputer.transform(X_test)
                # Restore the column names before scaling.
                X_test = pd.DataFrame(X_test, columns=columns)
                X_test = scaler.transform(X_test)
                y_prob = utils.adapt_proba(model.predict_proba(X_test))

        metrics = compute_all_metrics(y_test, y_prob)
        print(f"***** {group} *****")
        if rounded:
            print(f"AUROC: {metrics.auroc:.3f}, AUPRC: {metrics.auprc:.3f}\nBrier Score: {metrics.brier:.3f}\nHosmer-Lemeshow H statistics: {metrics.H:.2f}, p: {metrics.p_h:.5}\nHosmer-Lemeshow C statistics: {metrics.C:.2f}, p: {metrics.p_c:.5}\nSMR: {metrics.smr:.3f}\n")
        else:
            print(f"AUROC: {metrics.auroc}, AUPRC: {metrics.auprc}\nBrier Score: {metrics.brier}\nHosmer-Lemeshow H statistics: {metrics.H}, p: {metrics.p_h}\nHosmer-Lemeshow C statistics: {metrics.C}, p: {metrics.p_c}\nSMR: {metrics.smr}\n")

## FasterRisk: Group Sparsity 40

In [None]:
# FasterRisk with group sparsity 40: report per gender group, then per ethnicity group.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='fasterrisk', exp=None, group_sparsity=40, type=attribute)

## FasterRisk: Group Sparsity 15

In [None]:
# FasterRisk with group sparsity 15: report per gender group, then per ethnicity group.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='fasterrisk', exp=None, group_sparsity=15, type=attribute)

## FasterRisk: Group Sparsity 10

In [None]:
# FasterRisk with group sparsity 10: report per gender group, then per ethnicity group.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='fasterrisk', exp=None, group_sparsity=10, type=attribute)

## OASIS Calibrated

In [None]:
# Calibrated OASIS: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='OASIS', exp=None, group_sparsity=10, type=attribute, calibrate=True)

## SAPS II Calibrated

In [None]:
# Calibrated SAPS II: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='SAPS II', exp=None, group_sparsity=10, type=attribute, calibrate=True)

## APACHE IV Calibrated

In [None]:
# Calibrated APACHE IV: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='APACHE IV', exp=None, group_sparsity=10, type=attribute, calibrate=True)

## APACHE IVa Calibrated

In [None]:
# Calibrated APACHE IVa: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='APACHE IVa', exp=None, group_sparsity=10, type=attribute, calibrate=True)

## OASIS

In [None]:
# Uncalibrated OASIS: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='OASIS', exp=None, group_sparsity=10, type=attribute)

## SAPS II

In [None]:
# Uncalibrated SAPS II: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='SAPS II', exp=None, group_sparsity=10, type=attribute)

## APACHE IV

In [None]:
# Uncalibrated APACHE IV: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='APACHE IV', exp=None, group_sparsity=10, type=attribute)

## APACHE IVa

In [None]:
# Uncalibrated APACHE IVa: report per gender group, then per ethnicity group.
# group_sparsity is only read for method='fasterrisk', so its value has no effect here.
for attribute in ('gender', 'ethnicity'):
    metrics_across_groups(eicu_df, method='APACHE IVa', exp=None, group_sparsity=10, type=attribute)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from mimic_pipeline.metric import get_calibration_curve, compute_all_metrics

# Set the default resolution of figures created from here on to 300 DPI.
plt.rcParams['figure.dpi'] = 300

def plot_metrics_across_groups(eicu_df, group_sparsity, figsize=(15,5), fontsize=14, linewidth=1, markersize=5):
    """Plot ROC, precision-recall and calibration curves of FasterRisk per subgroup.

    Builds a 1x3 figure (ROC, precision-recall, calibration) and draws one
    curve per gender group followed by one per ethnicity group, using the
    FasterRisk model, binarizer and calibrator saved for ``group_sparsity``.
    The figure is not returned; it is left as pyplot's current figure so the
    caller can ``plt.savefig(...)`` / ``plt.close()`` it.

    Args:
        eicu_df: DataFrame with ``gender``, ``ethnicity``,
            ``hospital_expire_flag``, the identifier and score columns dropped
            below, and the model features in the remaining columns.
        group_sparsity: Selects ``models/fasterrisk/fasterrisk-{group_sparsity}``
            and its ``-binarizer`` / ``-calibrator`` companions.
        figsize: Figure size forwarded to ``plt.subplots``.
        fontsize: Font size of axis labels and tick labels; the legend uses
            ``fontsize - 3``.
        linewidth: Line width of every curve and of the diagonal reference lines.
        markersize: Marker size of the calibration points.
    """
    sns.set_style('ticks')
    fig, axes = plt.subplots(1, 3, figsize=figsize)
    roc_ax, pr_ax, calib_ax = axes

    # Dashed diagonal reference line on the ROC and calibration panels.
    for ax in (roc_ax, calib_ax):
        sns.lineplot(x=np.linspace(0,1), y=np.linspace(0,1), color='black', linestyle='--', linewidth=linewidth, ax=ax)

    fasterrisk = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}")
    binarizer = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}-binarizer")
    calibrator = joblib.load(f"models/fasterrisk/fasterrisk-{group_sparsity}-calibrator")

    # Identifier, label, score and demographic columns that are not model features.
    non_feature_columns = [
        'uniquepid', 'patientunitstayid', 'hospital_expire_flag',
        'apache_iv_prob', 'apache_iva_prob', 'oasis_prob', 'sapsii_prob',
        'ethnicity', 'gender',
    ]
    # (column, label) pairs in plotting order: gender groups first, then
    # ethnicity groups, so the colour assigned to each curve stays the same.
    subgroups = [('gender', label) for label in ('Female', 'Male')]
    subgroups += [
        ('ethnicity', label)
        for label in ('Caucasian', 'African American', 'Asian', 'Hispanic', 'Other/Unknown', 'Native American')
    ]

    for column, label in subgroups:
        subgroup_df = eicu_df[eicu_df[column] == label]
        if subgroup_df.empty:
            # There is nothing to draw for a group without patients.
            print(f"Skipping {label}: no patients in this group")
            continue
        X_test = subgroup_df.drop(non_feature_columns, axis=1)
        y_test = subgroup_df['hospital_expire_flag']
        X_test, _ = binarizer.transform(X_test)
        y_prob = calibrator.transform(fasterrisk.predict_proba(X_test.to_numpy()))
        metrics = compute_all_metrics(y_test, y_prob)
        # NOTE(review): seaborn's lineplot averages y over repeated x values by
        # default (estimator='mean') and sorts by x. If metrics.fpr / metrics.recall
        # contain repeated values, the drawn curve is that per-x mean rather than
        # the raw step curve. Consider estimator=None, sort=False. Left unchanged
        # so existing figures stay identical.
        sns.lineplot(x=metrics.fpr, y=metrics.tpr, label=label, linewidth=linewidth, ax=roc_ax, errorbar=None)
        sns.lineplot(x=metrics.recall, y=metrics.precision, label=label, linewidth=linewidth, ax=pr_ax, errorbar=None)
        sns.lineplot(x=metrics.prob_true, y=metrics.prob_pred, label=label, linewidth=linewidth, ax=calib_ax, marker='s', markersize=markersize)

    roc_ax.set_ylabel("True Positive Rate", fontsize=fontsize)
    roc_ax.set_xlabel("False Positive Rate", fontsize=fontsize)
    pr_ax.set_ylabel("Precision", fontsize=fontsize)
    pr_ax.set_xlabel("Recall", fontsize=fontsize)
    calib_ax.set_ylabel("Predicted Probability", fontsize=fontsize)
    calib_ax.set_xlabel("True Probability", fontsize=fontsize)
    for ax in axes:
        ax.tick_params(axis='both', which='major', labelsize=fontsize)

    # Only the calibration panel keeps a legend. get_legend() returns None when
    # no labelled curve was drawn, so guard the removal.
    for ax in (roc_ax, pr_ax):
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
    calib_ax.legend(fontsize=fontsize-3)

    fig.tight_layout()

# Render and save one fairness figure per FasterRisk group-sparsity setting.
for sparsity in (10, 15, 40):
    plot_metrics_across_groups(
        eicu_df,
        group_sparsity=sparsity,
        figsize=(21, 7),
        fontsize=19,
        linewidth=1.5,
        markersize=8,
    )
    # savefig writes pyplot's current figure, i.e. the one created just above;
    # close it afterwards so figures do not accumulate across iterations.
    plt.savefig(f'fairness-{sparsity}.pdf', dpi=300, format='pdf')
    plt.close()