# Model Performance - MIMIC Dataset

In [1]:
import os
from importlib import resources as impresources

import numpy as np
import pandas as pd
import plotly.express as px
import yaml
from pandas.api.types import is_bool_dtype, is_numeric_dtype
from sklearn.metrics import classification_report, roc_auc_score, balanced_accuracy_score

from recurrent_health_events_prediction import configs
from recurrent_health_events_prediction.model_analysis.utils import subset_metric_report, add_pred_cols, bin_numeric_column, stratified_bootstrap_delta

In [2]:
# Load the YAML configs shipped inside the `configs` package.
# importlib.resources.files() only guarantees a Traversable, which is not
# necessarily a real filesystem path (e.g. zip-installed packages), so the
# resource is opened through Traversable.open() instead of the built-in open().
config_root = impresources.files(configs)
with (config_root / 'model_config.yaml').open('r', encoding='utf-8') as f:
    model_config = yaml.safe_load(f)
with (config_root / 'data_config.yaml').open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

In [3]:
# Key used to select this dataset's section in the model config.
DATASET = "mimic"
# Directory holding the baseline classifiers' prediction file. The default is the
# original dev-container path; set the BASELINE_RESULTS_DIR environment variable
# to run the notebook from another checkout without editing this cell.
BASELINE_RESULTS_DIR = os.environ.get(
    "BASELINE_RESULTS_DIR",
    "/workspaces/master-thesis-recurrent-health-events-prediction/_models/mimic/classifiers_baselines",
)
# Event-identifier column of the training table; merged against `sample_id` below.
event_id_col = "HADM_ID"


## Load Model Config and Predicted Probabilities

In [4]:
# Locate the saved HMM run and read the YAML config stored alongside it.
model_name = "hmm_mimic_time_log_normal"
base_model_dir = model_config[DATASET]["hidden_markov"]["save_model_path"]
model_dir = os.path.join(base_model_dir, model_name)
config_path = os.path.join(model_dir, model_name + "_config.yaml")
with open(config_path) as f:
    hmm_config = yaml.safe_load(f)

In [5]:
# Read both prediction files and the training table that carries the hidden states.
hmm_predictions_path = os.path.join(model_dir, "prob_predictions.csv")
baseline_predictions_path = os.path.join(BASELINE_RESULTS_DIR, "prob_predictions.csv")
training_table_path = os.path.join(model_dir, "last_events_with_hidden_states.csv")

probs_pred_hmm_feat_df = pd.read_csv(hmm_predictions_path)
probs_pred_baseline_df = pd.read_csv(baseline_predictions_path)
full_training_df = pd.read_csv(training_table_path).drop(columns=["index"])

# Prediction columns are discovered on the HMM file; the same names are then
# prefixed in both frames so they can live side by side after merging.
pred_cols = [name for name in probs_pred_hmm_feat_df.columns if name.startswith("y_pred_")]
pred_cols_hmm_renamed = [f"hmm_{name}" for name in pred_cols]
pred_cols_baseline_renamed = [f"baseline_{name}" for name in pred_cols]

probs_pred_hmm_feat_df = probs_pred_hmm_feat_df.rename(
    columns=dict(zip(pred_cols, pred_cols_hmm_renamed))
)
probs_pred_baseline_df = probs_pred_baseline_df.rename(
    columns=dict(zip(pred_cols, pred_cols_baseline_renamed))
)

In [6]:
# Report the number of rows in each loaded table.
for label, frame in (
    ("Number of rows in full training data:", full_training_df),
    ("Number of rows in HMM-based features predictions set:", probs_pred_hmm_feat_df),
    ("Number of rows in baseline predictions set:", probs_pred_baseline_df),
):
    print(label, len(frame))

Number of rows in full training data: 1369
Number of rows in HMM-based features predictions set: 274
Number of rows in baseline predictions set: 274


In [7]:
# Summarize the columns, dtypes and non-null counts of the training table.
full_training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369 entries, 0 to 1368
Data columns (total 64 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SUBJECT_ID                            1369 non-null   int64  
 1   HADM_ID                               1369 non-null   int64  
 2   ADMITTIME                             1369 non-null   object 
 3   DISCHTIME                             1369 non-null   object 
 4   ADMISSION_TYPE                        1369 non-null   object 
 5   ETHNICITY                             1369 non-null   object 
 6   INSURANCE                             1369 non-null   object 
 7   HOSPITALIZATION_DAYS                  1369 non-null   float64
 8   NUM_COMORBIDITIES                     1369 non-null   int64  
 9   TYPES_COMORBIDITIES                   1369 non-null   object 
 10  HAS_DIABETES                          1369 non-null   bool   
 11  HAS_COPD         

In [8]:
# Preview the first rows of the HMM-feature prediction table.
probs_pred_hmm_feat_df.head()

Unnamed: 0,sample_id,y_true,hmm_y_pred_proba_logreg,hmm_y_pred_proba_rf,hmm_y_pred_proba_lgbm
0,182562,0,0.355552,0.359547,0.032618
1,125726,0,0.375518,0.366875,0.029622
2,195290,0,0.50698,0.489602,0.190586
3,155036,0,0.463604,0.464418,0.191892
4,160425,1,0.523131,0.53488,0.256178


In [9]:
# Preview the first rows of the baseline prediction table.
probs_pred_baseline_df.head()

Unnamed: 0,sample_id,y_true,baseline_y_pred_proba_logreg,baseline_y_pred_proba_rf,baseline_y_pred_proba_lgbm
0,182562,0,0.429645,0.368082,0.089334
1,125726,0,0.439598,0.381308,0.148035
2,195290,0,0.518121,0.524725,0.088933
3,155036,0,0.477996,0.499453,0.076492
4,160425,1,0.514006,0.455528,0.199213


## Comparison Dataframe and Sub-Groups to Analyze

In [10]:
# Attributes from the training table used to stratify the model comparison.
subgroup_cols = [
    "NUM_COMORBIDITIES",
    "NUM_PREV_HOSPITALIZATIONS",
    "READM_30_DAYS_PAST_SUM",
    "LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN",
    "HAS_COPD",
    "HAS_DIABETES",
    "HAS_CONGESTIVE_HF"
]

# Attach the subgroup attributes to every HMM prediction row. `validate` makes
# pandas raise if an event id occurs more than once in the training table, which
# would otherwise silently duplicate prediction rows.
comparison_df = pd.merge(
    probs_pred_hmm_feat_df,
    full_training_df[[event_id_col] + subgroup_cols],
    left_on="sample_id",
    right_on=event_id_col,
    how="left",
    validate="many_to_one",
).drop(columns=[event_id_col])

# Add the baseline predictions for the same samples. `y_true` is dropped from the
# baseline side so the label column is not duplicated by the merge.
comparison_df = pd.merge(
    comparison_df,
    probs_pred_baseline_df.drop(columns=["y_true"]),
    on="sample_id",
    how="inner",
    validate="one_to_one",
)

# An inner join silently discards samples missing from either file; fail loudly
# instead of comparing the two models on a shrunken sample set.
assert len(comparison_df) == len(probs_pred_hmm_feat_df) == len(probs_pred_baseline_df), (
    "HMM and baseline prediction sets do not cover the same samples"
)

In [11]:
# List the columns of the merged comparison table.
comparison_df.columns

Index(['sample_id', 'y_true', 'hmm_y_pred_proba_logreg', 'hmm_y_pred_proba_rf',
       'hmm_y_pred_proba_lgbm', 'NUM_COMORBIDITIES',
       'NUM_PREV_HOSPITALIZATIONS', 'READM_30_DAYS_PAST_SUM',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN', 'HAS_COPD', 'HAS_DIABETES',
       'HAS_CONGESTIVE_HF', 'baseline_y_pred_proba_logreg',
       'baseline_y_pred_proba_rf', 'baseline_y_pred_proba_lgbm'],
      dtype='object')

In [12]:
# Extend the comparison table via the project helper, then preview the result.
# NOTE(review): 0.5 is presumably the probability threshold used to derive the
# hard-label `*_y_pred_<model>` columns — confirm against add_pred_cols.
# NOTE(review): this cell reassigns its own input, so re-running it alone feeds the
# already-extended frame back into the helper — confirm that is harmless.
comparison_df = add_pred_cols(comparison_df, 0.5)
comparison_df.head()

Unnamed: 0,sample_id,y_true,hmm_y_pred_proba_logreg,hmm_y_pred_proba_rf,hmm_y_pred_proba_lgbm,NUM_COMORBIDITIES,NUM_PREV_HOSPITALIZATIONS,READM_30_DAYS_PAST_SUM,LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN,HAS_COPD,...,HAS_CONGESTIVE_HF,baseline_y_pred_proba_logreg,baseline_y_pred_proba_rf,baseline_y_pred_proba_lgbm,hmm_y_pred_logreg,hmm_y_pred_rf,hmm_y_pred_lgbm,baseline_y_pred_logreg,baseline_y_pred_rf,baseline_y_pred_lgbm
0,182562,0,0.355552,0.359547,0.032618,2,1,0.0,5.410802,False,...,False,0.429645,0.368082,0.089334,0,0,0,0,0,0
1,125726,0,0.375518,0.366875,0.029622,4,1,0.0,7.290958,False,...,False,0.439598,0.381308,0.148035,0,0,0,0,0,0
2,195290,0,0.50698,0.489602,0.190586,2,2,1.0,4.384697,True,...,False,0.518121,0.524725,0.088933,1,0,0,1,1,0
3,155036,0,0.463604,0.464418,0.191892,3,1,0.0,5.560658,False,...,True,0.477996,0.499453,0.076492,0,0,0,0,0,0
4,160425,1,0.523131,0.53488,0.256178,6,2,1.0,5.425724,False,...,True,0.514006,0.455528,0.199213,1,1,0,1,0,0


In [13]:
# Bin every numeric subgroup attribute into quantile-based groups.
# Booleans are excluded explicitly because pandas reports bool columns as numeric;
# the HAS_* flags are used as subgroups unchanged.
# is_numeric_dtype / is_bool_dtype come from the import cell at the top of the notebook.
for col in subgroup_cols:
    if is_numeric_dtype(comparison_df[col]) and not is_bool_dtype(comparison_df[col]):
        comparison_df = bin_numeric_column(comparison_df, col, num_bins=3, strategy="quantile")

In [14]:
# Subgroup columns from here on: every column whose name ends in "_SUBGROUP",
# followed by the three boolean comorbidity flags.
subgroup_cols = [
    *(name for name in comparison_df.columns if name.endswith("_SUBGROUP")),
    "HAS_COPD",
    "HAS_DIABETES",
    "HAS_CONGESTIVE_HF",
]
subgroup_cols

['NUM_COMORBIDITIES_SUBGROUP',
 'NUM_PREV_HOSPITALIZATIONS_SUBGROUP',
 'READM_30_DAYS_PAST_SUM_SUBGROUP',
 'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP',
 'HAS_COPD',
 'HAS_DIABETES',
 'HAS_CONGESTIVE_HF']

In [15]:
model_to_analyze = "rf" # Change this to the model you want to analyze 'logreg', 'lgbm', 'rf'

# Keep only the selected model's prediction columns. The match is on the exact
# naming pattern (known prefix + "_<model>" suffix) rather than a substring test,
# so a short key such as "rf" cannot pull in an unrelated column whose name merely
# contains those letters.
model_pred_cols = [
    col for col in comparison_df.columns
    if col.startswith(("hmm_y_pred_", "baseline_y_pred_"))
    and col.endswith("_" + model_to_analyze)
]
cols = ['sample_id', 'y_true'] + subgroup_cols + model_pred_cols
comparison_model_df = comparison_df[cols]
comparison_model_df.head()

Unnamed: 0,sample_id,y_true,NUM_COMORBIDITIES_SUBGROUP,NUM_PREV_HOSPITALIZATIONS_SUBGROUP,READM_30_DAYS_PAST_SUM_SUBGROUP,LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP,HAS_COPD,HAS_DIABETES,HAS_CONGESTIVE_HF,hmm_y_pred_proba_rf,baseline_y_pred_proba_rf,hmm_y_pred_rf,baseline_y_pred_rf
0,182562,0,"(0.999, 3.0]","(0.999, 2.0]","(-0.001, 1.0]","(4.332, 5.579]",False,False,False,0.359547,0.368082,0,0
1,125726,0,"(3.0, 4.0]","(0.999, 2.0]","(-0.001, 1.0]","(5.579, 8.126]",False,False,False,0.366875,0.381308,0,0
2,195290,0,"(0.999, 3.0]","(0.999, 2.0]","(-0.001, 1.0]","(4.332, 5.579]",True,False,False,0.489602,0.524725,0,1
3,155036,0,"(0.999, 3.0]","(0.999, 2.0]","(-0.001, 1.0]","(4.332, 5.579]",False,False,True,0.464418,0.499453,0,0
4,160425,1,"(4.0, 8.0]","(0.999, 2.0]","(-0.001, 1.0]","(4.332, 5.579]",False,False,True,0.53488,0.455528,1,0


In [16]:
# List the columns retained for the selected model.
comparison_model_df.columns

Index(['sample_id', 'y_true', 'NUM_COMORBIDITIES_SUBGROUP',
       'NUM_PREV_HOSPITALIZATIONS_SUBGROUP', 'READM_30_DAYS_PAST_SUM_SUBGROUP',
       'LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP', 'HAS_COPD',
       'HAS_DIABETES', 'HAS_CONGESTIVE_HF', 'hmm_y_pred_proba_rf',
       'baseline_y_pred_proba_rf', 'hmm_y_pred_rf', 'baseline_y_pred_rf'],
      dtype='object')

In [17]:
# Column names of the selected model's probabilities and hard predictions.
hmm_pred_proba_col = f"hmm_y_pred_proba_{model_to_analyze}"
baseline_pred_proba_col = f"baseline_y_pred_proba_{model_to_analyze}"
hmm_pred_col = f"hmm_y_pred_{model_to_analyze}"
baseline_pred_col = f"baseline_y_pred_{model_to_analyze}"

## Results

In [18]:
# Headline metrics for the classifier trained on HMM-based features.
print("AUC HMM-based features:", roc_auc_score(comparison_model_df['y_true'], comparison_model_df[hmm_pred_proba_col]))
# Label corrected: this score is computed from hmm_pred_col, so it describes the
# HMM-feature model, not the baseline. The value is sklearn's balanced accuracy.
print("Weighted Acc. HMM-based features:", balanced_accuracy_score(comparison_model_df['y_true'], comparison_model_df[hmm_pred_col]))
print("HMM-based features classification report:")
print(classification_report(comparison_model_df['y_true'], comparison_model_df[hmm_pred_col], zero_division=0))

AUC HMM-based features: 0.6511482689238881
Weighted Acc. Baseline model: 0.6194754735308403
HMM-based features classification report:
              precision    recall  f1-score   support

           0       0.81      0.69      0.75       203
           1       0.38      0.55      0.45        71

    accuracy                           0.65       274
   macro avg       0.60      0.62      0.60       274
weighted avg       0.70      0.65      0.67       274



In [19]:
# Headline metrics for the baseline classifier, from the same comparison table.
y_true = comparison_model_df['y_true']
baseline_proba = comparison_model_df[baseline_pred_proba_col]
baseline_labels = comparison_model_df[baseline_pred_col]

print("AUC Baseline model:", roc_auc_score(y_true, baseline_proba))
print("Weighted Acc. Baseline model:", balanced_accuracy_score(y_true, baseline_labels))
print("Baseline classification report:")
print(classification_report(y_true, baseline_labels, zero_division=0))

AUC Baseline model: 0.6480954693679317
Weighted Acc. Baseline model: 0.6046971484076875
Baseline classification report:
              precision    recall  f1-score   support

           0       0.81      0.66      0.73       203
           1       0.36      0.55      0.44        71

    accuracy                           0.63       274
   macro avg       0.58      0.60      0.58       274
weighted avg       0.69      0.63      0.65       274



In [20]:
# Bootstrap the AUC difference between the baseline and HMM-feature probabilities
# over the whole comparison set, using 5000 resamples.
# NOTE(review): no seed is passed, so the interval and p-value may vary between
# runs — confirm whether the helper accepts a seed / random state.
stratified_bootstrap_delta(
    df=comparison_model_df,
    y_col="y_true",
    base_col=baseline_pred_proba_col,
    hmm_col=hmm_pred_proba_col,
    metric="auc",
    n_boot=5000,
    threshold=0.5,
)

100%|██████████| 5000/5000 [00:09<00:00, 553.70it/s]


{'metric': 'auc',
 'obs_delta': 0.003052799555956387,
 'delta_mean': 0.0030540900575869046,
 'ci_low': -0.0280996322764171,
 'ci_high': 0.035386456671060856,
 'p_value': 0.8696,
 'n_boot': 5000}

In [21]:
# Same bootstrap comparison with metric="acc"; the probability columns are passed
# together with threshold=0.5.
# NOTE(review): presumably the helper thresholds the probabilities itself for
# label-based metrics — confirm. No seed is passed, so results may vary per run.
stratified_bootstrap_delta(
    df=comparison_model_df,
    y_col="y_true",
    base_col=baseline_pred_proba_col,
    hmm_col=hmm_pred_proba_col,
    metric="acc",
    n_boot=5000,
    threshold=0.5,
)

100%|██████████| 5000/5000 [00:05<00:00, 951.06it/s]


{'metric': 'acc',
 'obs_delta': 0.014778325123152802,
 'delta_mean': 0.014914917088739333,
 'ci_low': -0.027822105044057466,
 'ci_high': 0.05984180947755502,
 'p_value': 0.5112,
 'n_boot': 5000}

### NUM_COMORBIDITIES_SUBGROUP

In [22]:
# Per-group F1 of the baseline and HMM-feature hard predictions, grouped by the
# binned number of comorbidities.
group_col = "NUM_COMORBIDITIES_SUBGROUP"
subset_metric_report(comparison_model_df, group_by=group_col, metric="f1",
                     y_col="y_true",
                     base_col=baseline_pred_col, hmm_col=hmm_pred_col)

  df.groupby(group_by, dropna=False, group_keys=False)


Unnamed: 0,NUM_COMORBIDITIES_SUBGROUP,n,f1_baseline,f1_hmm,delta
0,"(0.999, 3.0]",138.0,0.677046,0.712472,0.035426
1,"(3.0, 4.0]",53.0,0.604182,0.569468,-0.034714
2,"(4.0, 8.0]",83.0,0.632412,0.655035,0.022623


### NUM_PREV_HOSPITALIZATIONS_SUBGROUP

In [27]:
# Per-group F1 of the baseline and HMM-feature hard predictions, grouped by the
# binned number of previous hospitalizations. `group_col` is reused by the cells
# that follow in this section.
group_col = "NUM_PREV_HOSPITALIZATIONS_SUBGROUP"
subset_metric_report(comparison_model_df, group_by=group_col, metric="f1",
                        y_col="y_true",
                        base_col=baseline_pred_col, hmm_col=hmm_pred_col)

  df.groupby(group_by, dropna=False, group_keys=False)


Unnamed: 0,NUM_PREV_HOSPITALIZATIONS_SUBGROUP,n,f1_baseline,f1_hmm,delta
0,"(0.999, 2.0]",203.0,0.682937,0.691478,0.008541
1,"(2.0, 10.0]",71.0,0.560478,0.610954,0.050476


In [28]:
# Show the categories of the current grouping column (read via the `.cat`
# accessor, so the column must be categorical).
group_categories = comparison_model_df[group_col].cat.categories
print("Group categories:", group_categories)

Group categories: IntervalIndex([(0.999, 2.0], (2.0, 10.0]], dtype='interval[float64, right]')


In [29]:
# Pick the last category of the grouping column for a closer look.
group_selected = group_categories[-1]  # Change this to the group you want to analyze
print("Selected group for analysis:", group_selected)

Selected group for analysis: (2.0, 10.0]


In [30]:
# Restrict the comparison table to the rows that fall in the selected group.
in_selected_group = comparison_model_df[group_col] == group_selected
subset_group_df = comparison_model_df.loc[in_selected_group]
subset_group_df.head()

Unnamed: 0,sample_id,y_true,NUM_COMORBIDITIES_SUBGROUP,NUM_PREV_HOSPITALIZATIONS_SUBGROUP,READM_30_DAYS_PAST_SUM_SUBGROUP,LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP,HAS_COPD,HAS_DIABETES,HAS_CONGESTIVE_HF,hmm_y_pred_proba_rf,baseline_y_pred_proba_rf,hmm_y_pred_rf,baseline_y_pred_rf
9,110184,0,"(3.0, 4.0]","(2.0, 10.0]","(-0.001, 1.0]","(4.332, 5.579]",True,False,True,0.578648,0.578617,1,1
10,171650,0,"(0.999, 3.0]","(2.0, 10.0]","(-0.001, 1.0]","(4.332, 5.579]",False,False,False,0.493887,0.487467,0,0
11,148880,0,"(4.0, 8.0]","(2.0, 10.0]","(-0.001, 1.0]","(5.579, 8.126]",True,True,True,0.424162,0.387744,0,0
16,184536,0,"(4.0, 8.0]","(2.0, 10.0]","(-0.001, 1.0]","(5.579, 8.126]",False,False,True,0.472011,0.494676,0,0
23,175797,0,"(3.0, 4.0]","(2.0, 10.0]","(-0.001, 1.0]","(1.465, 4.332]",True,False,False,0.462719,0.413909,0,0


In [31]:
# Bootstrap the F1 difference between the two models within the selected subgroup
# only; the probability columns are passed together with threshold=0.5.
# NOTE(review): no seed is passed, so results may vary between runs — confirm
# whether the helper accepts a seed / random state.
stratified_bootstrap_delta(
    df=subset_group_df,
    y_col="y_true",
    base_col=baseline_pred_proba_col,
    hmm_col=hmm_pred_proba_col,
    metric="f1",
    n_boot=5000,
    threshold=0.5,
)

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [00:12<00:00, 414.01it/s]


{'metric': 'f1',
 'obs_delta': 0.050476117575015245,
 'delta_mean': 0.05037779152727397,
 'ci_low': -0.004312480489025315,
 'ci_high': 0.11482528087726279,
 'p_value': 0.1156,
 'n_boot': 5000}

### LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP

In [24]:
# Per-group F1 of the baseline and HMM-feature hard predictions, grouped by the
# binned LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN attribute.
group_col = "LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP"
subset_metric_report(comparison_model_df, group_by=group_col, metric="f1",
                     y_col="y_true",
                     base_col=baseline_pred_col, hmm_col=hmm_pred_col)

  df.groupby(group_by, dropna=False, group_keys=False)


Unnamed: 0,LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP,n,f1_baseline,f1_hmm,delta
0,"(1.465, 4.332]",92.0,0.495745,0.51625,0.020505
1,"(4.332, 5.579]",91.0,0.659995,0.710014,0.050018
2,"(5.579, 8.126]",91.0,0.739509,0.739509,0.0


### READM_30_DAYS_PAST_SUM_SUBGROUP

In [32]:
# Per-group F1 of the baseline and HMM-feature hard predictions, grouped by the
# binned READM_30_DAYS_PAST_SUM attribute. `group_col` is reused by the cells that
# follow in this section.
group_col = "READM_30_DAYS_PAST_SUM_SUBGROUP"
subset_metric_report(comparison_model_df, group_by=group_col, metric="f1",
                     y_col="y_true",
                     base_col=baseline_pred_col, hmm_col=hmm_pred_col)

  df.groupby(group_by, dropna=False, group_keys=False)


Unnamed: 0,READM_30_DAYS_PAST_SUM_SUBGROUP,n,f1_baseline,f1_hmm,delta
0,"(-0.001, 1.0]",254.0,0.663035,0.679524,0.016489
1,"(1.0, 5.0]",20.0,0.329345,0.417582,0.088238


In [33]:
# Show the categories of the current grouping column (read via the `.cat`
# accessor, so the column must be categorical).
group_categories = comparison_model_df[group_col].cat.categories
print("Group categories:", group_categories)

Group categories: IntervalIndex([(-0.001, 1.0], (1.0, 5.0]], dtype='interval[float64, right]')


In [34]:
# Pick the last category of the grouping column for a closer look.
group_selected = group_categories[-1]  # Change this to the group you want to analyze
print("Selected group for analysis:", group_selected)

Selected group for analysis: (1.0, 5.0]


In [35]:
# Restrict the comparison table to the rows that fall in the selected group.
in_selected_group = comparison_model_df[group_col] == group_selected
subset_group_df = comparison_model_df.loc[in_selected_group]
subset_group_df.head()

Unnamed: 0,sample_id,y_true,NUM_COMORBIDITIES_SUBGROUP,NUM_PREV_HOSPITALIZATIONS_SUBGROUP,READM_30_DAYS_PAST_SUM_SUBGROUP,LOG_DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN_SUBGROUP,HAS_COPD,HAS_DIABETES,HAS_CONGESTIVE_HF,hmm_y_pred_proba_rf,baseline_y_pred_proba_rf,hmm_y_pred_rf,baseline_y_pred_rf
25,190483,0,"(0.999, 3.0]","(2.0, 10.0]","(1.0, 5.0]","(1.465, 4.332]",False,False,True,0.628797,0.632307,1,1
27,119090,0,"(0.999, 3.0]","(2.0, 10.0]","(1.0, 5.0]","(4.332, 5.579]",False,False,True,0.546405,0.507347,1,1
30,154157,0,"(3.0, 4.0]","(2.0, 10.0]","(1.0, 5.0]","(1.465, 4.332]",False,True,True,0.607789,0.624346,1,1
36,142808,0,"(0.999, 3.0]","(2.0, 10.0]","(1.0, 5.0]","(4.332, 5.579]",False,True,True,0.479676,0.439605,0,0
41,148246,0,"(0.999, 3.0]","(2.0, 10.0]","(1.0, 5.0]","(1.465, 4.332]",True,True,False,0.51323,0.532178,1,1


In [37]:
# Bootstrap the F1 difference between the two models within the selected subgroup
# only. This call uses 10000 resamples, whereas the other bootstrap cells use 5000.
# NOTE(review): no seed is passed, so results may vary between runs — confirm
# whether the helper accepts a seed / random state.
stratified_bootstrap_delta(
    df=subset_group_df,
    y_col="y_true",
    base_col=baseline_pred_proba_col,
    hmm_col=hmm_pred_proba_col,
    metric="f1",
    n_boot=10000,
    threshold=0.5,
)

100%|██████████| 10000/10000 [00:23<00:00, 421.50it/s]


{'metric': 'f1',
 'obs_delta': 0.08823768823768824,
 'delta_mean': 0.08537313884796849,
 'ci_low': 0.0,
 'ci_high': 0.26742857142857157,
 'p_value': 0.7124,
 'n_boot': 10000}