# Survival Models vs Baseline - MIMIC Dataset - Bootstrap Test Set

In [1]:
import pandas as pd
import numpy as np
import yaml
import os
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import plotly.express as px
from recurrent_health_events_prediction.model_analysis.utils import MetricEnum

In [2]:
# Load the packaged model and data YAML configs.
# The resource handle's own ``open`` is used instead of the builtin ``open`` so
# loading does not depend on the resource being a plain filesystem path, and
# the encoding is pinned so parsing does not vary with the platform locale.
config_dir = impresources.files(configs)
with (config_dir / 'model_config.yaml').open('r', encoding='utf-8') as f:
    model_config = yaml.safe_load(f)
with (config_dir / 'data_config.yaml').open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

In [3]:
# Output directory for the MIMIC results, read from the data config.
final_results_cfg = data_config["final_results"]
SAVE_RESULTS_DIR = final_results_cfg["mimic_path"]

In [4]:
# --- Experiment settings ---
DATASET = "mimic"
METRIC = MetricEnum.C_INDEX.value
N_BOOTSTRAPS = 5000
RANDOM_STATE = 42

# --- Column names used in the events / predictions tables ---
DURATION_COL = "EVENT_DURATION"
EVENT_COL = "READMISSION_EVENT"
EVENT_ID_COL = "HADM_ID"

# --- Saved-model locations for the two model families being compared ---
next_event_cfg = model_config[DATASET]["next_event_model"]
BASELINE = "baseline_cox_ph_model"
BASELINE_SURVIVAL_BASE_DIR = next_event_cfg["last_event_based"]["save_model_path"]
HMM_SURVIVAL_BASE_DIR = next_event_cfg["hmm_based"]["save_model_path"]

In [5]:
# File name of the results CSV, and a message showing its full destination.
filename = f"hmm_survival_vs_{BASELINE}_{DATASET}_{METRIC}.csv"
results_csv_path = os.path.join(SAVE_RESULTS_DIR, filename)
print(f"Saving results to {results_csv_path}")

Saving results to /workspaces/master-thesis-recurrent-health-events-prediction/results/mimic/hmm_survival_vs_baseline_cox_ph_model_mimic_c_index.csv


## cox_ph_with_hmm_log_gamma_covs_model

In [10]:
from recurrent_health_events_prediction.model_analysis.utils import get_model_config

# HMM-based survival model evaluated in this section.
model_name = "cox_ph_with_hmm_log_gamma_covs_model"

# Display the configuration returned for that model.
hmm_model_config = get_model_config(model_name, HMM_SURVIVAL_BASE_DIR)
hmm_model_config

{'model_name': 'Cox PH with HMM Log Gamma Covs Model',
 'base_hmm_name': 'hmm_mimic_time_log_gamma',
 'model_type': 'cox_ph',
 'save_model_path': '/workspaces/master-thesis-recurrent-health-events-prediction/_models/mimic/hmm_survival',
 'strata_col': None,
 'event_col': 'READMISSION_EVENT',
 'event_id_col': 'HADM_ID',
 'duration_col': 'EVENT_DURATION',
 'cluster_col': None,
 'event_name': 'readmission',
 'features': ['AGE',
  'DISCHARGE_LOCATION_POST_ACUTE_CARE',
  'LOG_DAYS_IN_ICU',
  'CHARLSON_INDEX',
  'HAS_CONGESTIVE_HF',
  'PROB_HIDDEN_RISK_HIGH',
  'PROB_HIDDEN_RISK_MEDIUM',
  'PAST_COUNT_HIDDEN_RISK_HIGH',
  'PAST_COUNT_HIDDEN_RISK_MEDIUM',
  'PAST_COUNT_HIDDEN_RISK_LOW',
  'RANDOM_FEATURE'],
 'features_not_to_scale': ['HAS_CONGESTIVE_HF',
  'DISCHARGE_LOCATION_POST_ACUTE_CARE'],
 'evaluation_times': [30, 60, 90, 100],
 'main_evaluation_time': 30,
 'scale_features': True,
 'strata_remap': None}

In [11]:
from recurrent_health_events_prediction.model_analysis.utils import (
    get_compare_partial_hazard_df,
    get_partial_hazard_pred_df,
)

# NOTE(review): absolute, environment-specific path — consider sourcing it from data_config.
LAST_EVENTS_CSV = "/workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/multiple_hosp_patients/last_events.csv"

# Partial-hazard predictions of the HMM-based model and of the baseline model.
partial_harzards_hmm_df = get_partial_hazard_pred_df(
    HMM_SURVIVAL_BASE_DIR, model_name, baseline=False
)
partial_harzards_baseline_df = get_partial_hazard_pred_df(
    BASELINE_SURVIVAL_BASE_DIR, BASELINE, baseline=True
)
events_df = pd.read_csv(LAST_EVENTS_CSV)

# Combine the events table with both models' predictions into one comparison frame.
compare_partial_hazards_df = get_compare_partial_hazard_df(
    events_df,
    partial_harzards_baseline_df,
    partial_harzards_hmm_df,
)
compare_partial_hazards_df.head()

Unnamed: 0,baseline_partial_hazard,HADM_ID,hmm_partial_hazard,READMISSION_EVENT,EVENT_DURATION
0,0.570211,182562,0.502803,0,120.0
1,0.709849,125726,0.665041,0,120.0
2,1.110215,195290,1.332535,0,120.0
3,0.988656,155036,1.037472,0,120.0
4,1.179823,160425,0.977822,1,20.544444


In [12]:
from recurrent_health_events_prediction.model_analysis.utils import stratified_bootstrap_delta_cindex_lifelines

# Bootstrap comparison of the baseline and HMM partial-hazard columns,
# using N_BOOTSTRAPS resamples and a fixed random state.
bootstrap_kwargs = dict(
    base_pred_col="baseline_partial_hazard",
    hmm_pred_col="hmm_partial_hazard",
    n_boot=N_BOOTSTRAPS,
    random_state=RANDOM_STATE,
)
results_dict = stratified_bootstrap_delta_cindex_lifelines(
    compare_partial_hazards_df, DURATION_COL, EVENT_COL, **bootstrap_kwargs
)

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [00:10<00:00, 460.87it/s]


In [13]:
# Display the dictionary returned by stratified_bootstrap_delta_cindex_lifelines.
results_dict

{'metric': 'c_index',
 'obs_c_base': 0.6396177175906334,
 'obs_c_hmm': 0.6184934405416843,
 'obs_delta': -0.021124277048949036,
 'delta_mean': -0.021220275029422423,
 'ci_low': -0.043720212281982376,
 'ci_high': 0.001382500741884117,
 'p_value': 0.0636,
 'n_boot_kept': 5000,
 'n_boot': 5000,
 'stratified': True}

## All Results

In [32]:
# Stack the six result tables into one frame and order rows by ascending "delta_mean".
all_results_df_list = [
    results1_df,
    results2_df,
    results3_df,
    results4_df,
    results5_df,
    results6_df,
]
stacked_results_df = pd.concat(all_results_df_list, ignore_index=True)
all_results_df = stacked_results_df.sort_values(by="delta_mean")


In [33]:
# Display the combined results table, sorted by "delta_mean".
all_results_df

Unnamed: 0,model_name,n_states,classifier,metric,obs_delta,delta_mean,ci_low,ci_high,p_value,n_boot
4,hmm_mimic_time_log_student_t,3,lgbm,f1,-0.039254,-0.039476,-0.083022,0.002665,0.0688,5000
2,hmm_mimic_time_binary_30_days,4,lgbm,f1,-0.02878,-0.029314,-0.079294,0.020821,0.2572,5000
0,hmm_mimic_time_log_normal_v1,3,lgbm,f1,-0.014757,-0.015006,-0.071794,0.040291,0.5956,5000
5,hmm_mimic_time_log_normal_v2,5,lgbm,f1,-0.002789,-0.002792,-0.055295,0.052265,0.9056,5000
1,hmm_mimic_time_log_gamma,3,lgbm,f1,0.005263,0.005271,-0.039579,0.050387,0.8256,5000
3,hmm_mimic_time_log_normal_min_emissions,4,lgbm,f1,0.005583,0.005502,-0.048996,0.058865,0.8536,5000


In [34]:
# Ensure the output directory exists, then write the combined results table.
# ``exist_ok=True`` already makes ``makedirs`` a no-op when the directory is
# present, so a separate ``os.path.exists`` check is redundant (and racy).
os.makedirs(SAVE_RESULTS_DIR, exist_ok=True)
all_results_df.to_csv(os.path.join(SAVE_RESULTS_DIR, filename), index=False)