# Deep Learning Models Performance by Subgroup - MIMIC Dataset

In [1]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml

import pandas as pd

from sklearn.metrics import roc_auc_score, f1_score

In [3]:
# Notebook switches.
# MULTIPLE_HOSP_PATIENTS: when True, the "data_directory_multiple_hosp_patients"
# config entry is used instead of "data_directory".
# MODEL_RUN_NAME: sub-folder of the configured log directory holding the
# predictions that are evaluated below.
MULTIPLE_HOSP_PATIENTS: bool = False
MODEL_RUN_NAME: str = "attention_pooling_query_curr_20251108_125957"


In [4]:
# Locate the data configuration shipped inside the `configs` package.
data_config_path = impresources.files(configs) / "data_config.yaml"

# `files()` returns a Traversable, which is not guaranteed to be a real
# filesystem path, so use its own `open()` rather than the builtin. Binary mode
# hands the raw bytes to PyYAML, which detects the encoding itself instead of
# relying on the platform's locale default.
with data_config_path.open("rb") as f:
    data_config = yaml.safe_load(f)

# All settings used in this notebook live under training_data -> mimic.
mimic_config = data_config["training_data"]["mimic"]

runs_dir = mimic_config["tensorboard_log_dir"]

# The cohort switch decides which data directory entry is read.
data_dir = mimic_config[
    "data_directory_multiple_hosp_patients" if MULTIPLE_HOSP_PATIENTS else "data_directory"
]

test_sample_csv_path = f"{data_dir}/test.csv"
train_sample_csv_path = f"{data_dir}/train_full.csv"

print(f"Train samples CSV path: {train_sample_csv_path}")
print(f"Test samples CSV path: {test_sample_csv_path}")

Train samples CSV path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_full.csv
Test samples CSV path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/test.csv


In [None]:
# Path of the predictions CSV expected under <runs_dir>/<MODEL_RUN_NAME>/.
# NOTE(review): this cell carries no execution count in the saved notebook, yet
# the later `pd.read_csv(test_results_csv_path)` cell needs the variable it
# defines — make sure it runs before the predictions are loaded.
test_results_csv_path = f"{runs_dir}/{MODEL_RUN_NAME}/test_predictions.csv"
print(f"Test results CSV path: {test_results_csv_path}")

In [5]:
# Load the training samples from the config-derived path and preview the first rows.
train_samples_df = pd.read_csv(train_sample_csv_path)
train_samples_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,6.988889,3,"['other', 'congestive_heart_failure', 'myocard...",False,False,True,...,False,False,False,True,False,False,False,False,True,False
1,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,5.364583,4,"['other', 'diabetes_without_cc', 'myocardial_i...",True,True,False,...,False,False,False,True,False,False,False,True,True,False
2,32,175413,2170-04-04 08:00:00,2170-04-23 12:45:00,19.197917,2,"['other', 'chronic_pulmonary_disease']",False,True,False,...,False,False,True,False,False,False,False,True,True,False
3,33,176176,2116-12-23 22:30:00,2116-12-27 12:05:00,3.565972,2,"['other', 'chronic_pulmonary_disease']",False,True,False,...,False,False,False,True,False,False,False,False,True,False
4,34,115799,2186-07-18 16:46:00,2186-07-20 16:00:00,1.968056,3,"['other', 'congestive_heart_failure', 'myocard...",False,False,True,...,False,False,False,True,False,False,False,True,True,False


In [17]:
# Training rows whose NUM_PREV_HOSPITALIZATIONS equals 0.
first_hosp_samples_df = train_samples_df.loc[
    train_samples_df["NUM_PREV_HOSPITALIZATIONS"].eq(0)
]

# Patient identifiers (SUBJECT_ID) of those rows, as a plain Python list; they
# are used again by the overlap assertions later in the notebook.
first_hosp_samples_ids = first_hosp_samples_df["SUBJECT_ID"].to_list()

# Persist the subset; the DataFrame index is not written.
first_hosp_samples_df.to_csv(
    path_or_buf="/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/first_hosp_only/train_first_hosp_only_full.csv",
    index=False,
)

In [13]:
# SUBJECT_IDs that occur on more than one training row, in order of first
# appearance. Each id is mapped to its row count and kept when the count
# exceeds one.
mult_hosp_patients_train_ids = (
    train_samples_df["SUBJECT_ID"]
    .loc[lambda ids: ids.map(ids.value_counts()) > 1]
    .unique()
)
print(f"Number of multiple-hospitalization patients in train set: {len(mult_hosp_patients_train_ids)}")

# All training rows belonging to those patients, written out without the index.
mult_hosp_patients_df = train_samples_df.loc[
    train_samples_df["SUBJECT_ID"].isin(mult_hosp_patients_train_ids)
]
mult_hosp_patients_df.to_csv(
    path_or_buf="/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/more_prev_hosp_exp/train_more_prev_hosp_full.csv",
    index=False,
)

Number of multiple-hospitalization patients in train set: 1503


In [14]:
# Load the test samples from the config-derived path and preview the first rows.
test_samples_df = pd.read_csv(test_sample_csv_path)
test_samples_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,16.364583,2,"['other', 'renal_disease']",False,False,False,...,False,False,False,True,False,False,False,True,True,False
1,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,13.165278,6,"['myocardial_infarct', 'other', 'cerebrovascul...",True,False,True,...,False,False,False,True,False,False,False,True,False,True
2,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,5.013889,2,"['other', 'congestive_heart_failure']",False,False,True,...,True,False,False,True,False,False,False,False,True,False
3,49,190539,2186-11-21 07:15:00,2186-11-28 14:05:00,7.284722,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,True,False,False,False,True,False,True
4,68,170467,2173-12-15 16:16:00,2174-01-03 18:30:00,19.093056,4,"['other', 'congestive_heart_failure', 'aids', ...",False,False,True,...,False,False,False,True,False,True,False,False,True,False


In [15]:
# SUBJECT_IDs that occur on more than one test row, in order of first
# appearance. Each id is mapped to its row count and kept when the count
# exceeds one.
mult_hosp_patients_test_ids = (
    test_samples_df["SUBJECT_ID"]
    .loc[lambda ids: ids.map(ids.value_counts()) > 1]
    .unique()
)

print(f"Number of multiple-hospitalization patients in test set: {len(mult_hosp_patients_test_ids)}")

# All test rows belonging to those patients, written out without the index.
# NOTE: this reuses the name `mult_hosp_patients_df`, replacing the
# training-set frame assigned under the same name in the earlier cell.
mult_hosp_patients_df = test_samples_df.loc[
    test_samples_df["SUBJECT_ID"].isin(mult_hosp_patients_test_ids)
]
mult_hosp_patients_df.to_csv(
    path_or_buf="/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/more_prev_hosp_exp/test_more_prev_hosp.csv",
    index=False,
)

Number of multiple-hospitalization patients in test set: 375


In [19]:
# Patient-level (SUBJECT_ID) sanity checks on the derived subsets.

# 1. No patient with multiple rows in train also has multiple rows in test.
assert set(mult_hosp_patients_train_ids).isdisjoint(mult_hosp_patients_test_ids), (
    "Train and test sets have overlapping multiple-hospitalization patients!"
)

# 2. No patient of the train first-hospitalization subset appears among the
#    test multiple-hospitalization patients. The message names the two sets
#    that are actually compared (train first-hosp vs. test multi-hosp).
assert set(first_hosp_samples_ids).isdisjoint(mult_hosp_patients_test_ids), (
    "Train first-hospitalization patients overlap with test multiple-hospitalization patients!"
)

# 3. Every train patient with multiple rows also has a row in the train
#    first-hospitalization subset.
assert set(mult_hosp_patients_train_ids).issubset(first_hosp_samples_ids), (
    "Some multiple-hospitalization patients in train set are not in first-hospitalization set!"
)

In [7]:
# Load the test predictions from the path built from runs_dir and MODEL_RUN_NAME,
# then preview the first rows.
test_results_df = pd.read_csv(test_results_csv_path)
test_results_df.head()

Unnamed: 0,sample_id,y_true,y_pred_proba_attention_pooling_query_curr,y_pred_attention_pooling_query_curr
0,107064,0,0.117224,0
1,109451,0,0.159065,0
2,104557,0,0.059859,0
3,190539,0,0.086244,0
4,170467,1,0.21605,0


In [6]:
# Attach NUM_PREV_HOSPITALIZATIONS to every prediction row by matching the
# sample's HADM_ID with the prediction's sample_id.
samples_df_cols = ["HADM_ID", "NUM_PREV_HOSPITALIZATIONS"]
test_df = test_samples_df[samples_df_cols].merge(
    test_results_df,
    left_on="HADM_ID",
    right_on="sample_id",
    how="inner",
    # Each key must appear at most once on both sides. A duplicated key would
    # silently multiply rows and distort every metric computed from test_df,
    # so fail loudly (pandas raises MergeError) instead.
    validate="one_to_one",
)
# sample_id duplicates HADM_ID after the join, so it is dropped.
test_df = test_df.drop(columns=["sample_id"])
test_df.head()

Unnamed: 0,HADM_ID,NUM_PREV_HOSPITALIZATIONS,y_true,y_pred_proba_attention_pooling_query_curr,y_pred_attention_pooling_query_curr
0,107064,0,0,0.117224,0
1,109451,0,0,0.159065,0
2,104557,0,0,0.059859,0
3,190539,0,0,0.086244,0
4,170467,0,1,0.21605,0


In [7]:
y_true_col = "y_true"

# In the previews shown in this notebook the prediction columns are named
# "y_pred_proba_<model>" (probability) and "y_pred_<model>" (hard label).
# Resolve both by name, pairing the label column with the probability column of
# the same model, rather than assuming the label is the last column of test_df
# (which breaks as soon as a column is appended or reordered).
y_pred_proba_col = next(
    (col for col in test_df.columns if col.startswith("y_pred_proba")), None
)
if y_pred_proba_col is None:
    raise KeyError("No column starting with 'y_pred_proba' found in test_df")

y_label_col = y_pred_proba_col.replace("y_pred_proba", "y_pred", 1)
if y_label_col not in test_df.columns:
    raise KeyError(
        f"Expected label column '{y_label_col}' matching '{y_pred_proba_col}' not found in test_df"
    )

print(f"Using proba prediction column: {y_pred_proba_col}")
print(f"Using label prediction column: {y_label_col}")

Using proba prediction column: y_pred_proba_attention_pooling_query_curr
Using label prediction column: y_pred_attention_pooling_query_curr


In [8]:
# Metrics over the full merged test set: AUROC from the probability column,
# F1 from the hard-label column.
auc_roc_test = roc_auc_score(
    y_true=test_df[y_true_col],
    y_score=test_df[y_pred_proba_col],
)
f1_score_test = f1_score(
    y_true=test_df[y_true_col],
    y_pred=test_df[y_label_col],
)
print(f"Overall AUROC on Test Set ({len(test_df)}): {auc_roc_test:.3f}")
print(f"Overall F1 Score on Test Set ({len(test_df)}): {f1_score_test:.3f}")

Overall AUROC on Test Set (2599): 0.745
Overall F1 Score on Test Set (2599): 0.354


In [18]:
# Split the scored test rows by NUM_PREV_HOSPITALIZATIONS: exactly zero vs. greater than zero.
no_prev_hosp_df = test_df.loc[test_df["NUM_PREV_HOSPITALIZATIONS"].eq(0)]
more_prev_hosp_df = test_df.loc[test_df["NUM_PREV_HOSPITALIZATIONS"].gt(0)]

# De-duplicated HADM_IDs of each subgroup as plain Python lists.
ids_no_prev_hosp = no_prev_hosp_df["HADM_ID"].drop_duplicates().to_list()
ids_more_prev_hosp = more_prev_hosp_df["HADM_ID"].drop_duplicates().to_list()

In [10]:
# Mean of the y_true column in each subgroup, computed in the same order as the
# subgroups are reported below.
y_true_mean_no_prev, y_true_mean_more_prev = (
    subgroup[y_true_col].mean()
    for subgroup in (no_prev_hosp_df, more_prev_hosp_df)
)

print("Mean True Values by Subgroup:")
print(f"No Previous Hospitalizations ({len(no_prev_hosp_df)}): {y_true_mean_no_prev:.3f}")
print(f"More Previous Hospitalizations ({len(more_prev_hosp_df)}): {y_true_mean_more_prev:.3f}")

Mean True Values by Subgroup:
No Previous Hospitalizations (2246): 0.057
More Previous Hospitalizations (353): 0.272


In [11]:
# AUROC per subgroup, scored on the probability column.
auc_no_prev_hosp, auc_more_prev_hosp = (
    roc_auc_score(subgroup[y_true_col], subgroup[y_pred_proba_col])
    for subgroup in (no_prev_hosp_df, more_prev_hosp_df)
)

# F1 per subgroup, scored on the hard-label column.
f1_score_no_prev_hosp, f1_score_more_prev_hosp = (
    f1_score(subgroup[y_true_col], subgroup[y_label_col])
    for subgroup in (no_prev_hosp_df, more_prev_hosp_df)
)

print("AUROC Scores by Subgroup:")
print(f"No Previous Hospitalizations ({len(no_prev_hosp_df)}): {auc_no_prev_hosp:.3f}")
print(f"More Previous Hospitalizations ({len(more_prev_hosp_df)}): {auc_more_prev_hosp:.3f}")

print("\nF1 Scores by Subgroup:")
print(f"No Previous Hospitalizations ({len(no_prev_hosp_df)}): {f1_score_no_prev_hosp:.3f}")
print(f"More Previous Hospitalizations ({len(more_prev_hosp_df)}): {f1_score_more_prev_hosp:.3f}")

AUROC Scores by Subgroup:
No Previous Hospitalizations (2246): 0.657
More Previous Hospitalizations (353): 0.587

F1 Scores by Subgroup:
No Previous Hospitalizations (2246): 0.176
More Previous Hospitalizations (353): 0.442


## Training Done on Same Patients with More Previous Hospitalizations

In [35]:
# Inputs for the second experiment (run folder "attention_pooling_query_curr_exp_20251108_145706").
# NOTE(review): all three paths are hard-coded absolute paths. `train_path_v2`
# is the same file the multiple-hospitalization train cell writes earlier in
# this notebook, and `train_path` equals the train CSV path printed by the
# configuration cell — consider deriving them from the config instead.
test_results_path = "/workspaces/msc-thesis-recurrent-health-modeling/_runs/attention_pooling_query_curr_exp_20251108_145706/test_predictions.csv"
train_path_v2 = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/more_prev_hosp_exp/train_more_prev_hosp_full.csv"
train_path = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_full.csv"

# Load the experiment's training subset, its test predictions, and the full training set.
train_more_prev_hosp_v2_df = pd.read_csv(train_path_v2)
test_more_prev_hosp_results_df = pd.read_csv(test_results_path)
train_df = pd.read_csv(train_path)

In [36]:
# HADM_IDs present in the experiment's training subset.
ids_in_train_v2 = set(
    train_more_prev_hosp_v2_df["HADM_ID"].drop_duplicates().to_list()
)
# sample_ids scored by the experiment run.
ids_in_test_more_prev_hosp_v2 = set(
    test_more_prev_hosp_results_df["sample_id"].drop_duplicates().to_list()
)
# HADM_IDs of full-training-set rows with NUM_PREV_HOSPITALIZATIONS above zero.
original_train_ids_patients_more_prev_hosp = set(
    train_df.loc[train_df["NUM_PREV_HOSPITALIZATIONS"].gt(0), "HADM_ID"]
    .drop_duplicates()
    .to_list()
)

In [37]:
# The experiment's train and test identifiers must not share any value.
assert ids_in_train_v2.isdisjoint(ids_in_test_more_prev_hosp_v2), "Train and Test sets have overlapping HADM_IDs!"
# The experiment must have been scored on exactly the "more previous hospitalizations" test subgroup.
assert set(ids_more_prev_hosp) == ids_in_test_more_prev_hosp_v2, "Different Test Sets used!"
# The experiment's training file must hold exactly the training rows with previous hospitalizations.
assert set(train_more_prev_hosp_v2_df["HADM_ID"].drop_duplicates().to_list()) == original_train_ids_patients_more_prev_hosp, "Different Train Sets used!"

In [38]:
# Preview the predictions loaded from `test_results_path`.
test_more_prev_hosp_results_df.head()

Unnamed: 0,sample_id,y_true,y_pred_proba_attention_pooling_query_curr_exp,y_pred_attention_pooling_query_curr_exp
0,188869,0,0.419588,1
1,101651,0,0.240227,1
2,177951,0,0.169948,0
3,100765,0,0.342863,1
4,145911,0,0.366402,1


In [39]:
# AUROC of the experiment run, scored on its own probability column.
auc_roc_test_more_prev_hosp_v2 = roc_auc_score(
    y_true=test_more_prev_hosp_results_df["y_true"],
    y_score=test_more_prev_hosp_results_df["y_pred_proba_attention_pooling_query_curr_exp"],
)
print(f"AUROC on Test Set with More Previous Hospitalizations (v2): {auc_roc_test_more_prev_hosp_v2:.3f}")

AUROC on Test Set with More Previous Hospitalizations (v2): 0.617
