# Train and Evaluate Deep Learning Models on the MIMIC-III Dataset

In [1]:
from importlib import resources as impresources
import os

import torch

from recurrent_health_events_prediction.utils.general_utils import import_yaml_config
from recurrent_health_events_prediction import configs

# Toggle: when True, the preprocessed CSVs and the cached PyTorch datasets
# are rebuilt even if copies already exist on disk.
OVERWRITE_PREPROCESSED = True

# Resolve the packaged data_config.yaml and pull out the MIMIC training-data
# section together with the directory it points at.
data_config_file = impresources.files(configs) / "data_config.yaml"
data_config = import_yaml_config(data_config_file)

training_data_config = data_config["training_data"]["mimic"]
data_directory = training_data_config["data_directory"]

In [2]:
# Echo the directory that the train/test files are read from.
directory_label = "Train and test data directory:"
print(directory_label, data_directory)

Train and test data directory: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/mimic_cleaned_v2


In [4]:
# Name of the model variant; it selects both the sub-directory and the YAML
# file name of the configuration loaded below.
model_name = "gru_duration_aware"
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable root so the notebook runs outside this workspace.
model_config_path = f"/workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/{model_name}/{model_name}_config.yaml"

# Load the configuration and echo every top-level entry for the record.
model_config = import_yaml_config(model_config_path)
for key, value in model_config.items():
    print(f"{key}: {value}")

# Sanity-check that the input sizes handed to the model agree with the number
# of feature columns listed in the same config; fail with a message that names
# both values so a mismatch is easy to diagnose.
model_params_dict = model_config['model_params']
num_current_feats = len(model_config['current_feat_cols'])
num_longitudinal_feats = len(model_config['longitudinal_feat_cols'])
assert model_params_dict['input_size_curr'] == num_current_feats, (
    f"model_params['input_size_curr']={model_params_dict['input_size_curr']} "
    f"does not match the {num_current_feats} columns in 'current_feat_cols'"
)
assert model_params_dict['input_size_seq'] == num_longitudinal_feats, (
    f"model_params['input_size_seq']={model_params_dict['input_size_seq']} "
    f"does not match the {num_longitudinal_feats} columns in 'longitudinal_feat_cols'"
)

model_config_dir_path: /workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/gru_duration_aware
model_class: GRUNet
max_sequence_length: 4
reverse_chronological_order: False
longitudinal_feat_cols: ['LOG_HOSPITALIZATION_DAYS', 'LOG_DAYS_IN_ICU', 'CHARLSON_INDEX', 'NUM_PROCEDURES', 'LOG_NUM_DRUGS', 'DISCHARGE_LOCATION_POST_ACUTE_CARE', 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION']
current_feat_cols: ['LOG_HOSPITALIZATION_DAYS', 'LOG_DAYS_IN_ICU', 'CHARLSON_INDEX', 'NUM_PROCEDURES', 'LOG_NUM_DRUGS', 'DISCHARGE_LOCATION_POST_ACUTE_CARE', 'ADMISSION_TYPE_ELECTIVE', 'PARTICIPATION_DAYS', 'AGE', 'INSURANCE_MEDICARE', 'INSURANCE_PRIVATE', 'ETHNICITY_WHITE', 'ETHNICITY_BLACK']
batch_size: 64
num_epochs: 50
learning_rate: 0.001
model_params: {'input_size_curr': 13, 'hidden_size_head': 32, 'input_size_seq': 7, 'hidden_size_seq': 16, 'num_layers_seq': 1, 'dropout': 0.2}


In [6]:
# Locate the preprocessed event tables. They are regenerated when either file
# is missing or when OVERWRITE_PREPROCESSED forces a rebuild; otherwise the
# files already on disk are reused.
train_df_path = os.path.join(data_directory, "train_events_preprocessed.csv")
test_df_path = os.path.join(data_directory, "test_events_preprocessed.csv")

preprocessed_files_exist = os.path.exists(train_df_path) and os.path.exists(test_df_path)
if preprocessed_files_exist and not OVERWRITE_PREPROCESSED:
    print("Using existing preprocessed data.")
else:
    # Imported lazily: only needed when the data actually has to be rebuilt.
    from recurrent_health_events_prediction.training.train_deep_learning_models import create_preprocessed_data
    train_df_path, test_df_path = create_preprocessed_data(data_directory, training_data_config)
    print("Preprocessed data created.")

Preprocessed data created.


## Inspect Train and Test Data

In [7]:
import pandas as pd

# Read the preprocessed training events and show summary statistics for the
# columns listed under 'current_feat_cols' in the model config.
train_df = pd.read_csv(train_df_path)

current_feat_cols = model_config['current_feat_cols']
train_df[current_feat_cols].describe()

Unnamed: 0,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,CHARLSON_INDEX,NUM_PROCEDURES,LOG_NUM_DRUGS,PARTICIPATION_DAYS,AGE
count,14069.0,14069.0,14069.0,14069.0,14069.0,14069.0,14069.0
mean,-1.131293e-16,-3.636298e-17,-4.040331e-17,-7.272596e-17,1.777746e-16,3.2322650000000004e-17,3.636298e-17
std,1.000036,1.000036,1.000036,1.000036,1.000036,1.000036,1.000036
min,-2.576391,-1.865796,-2.035783,-1.161382,-3.003269,-0.3751914,-2.745736
25%,-0.6457673,-0.7506508,-0.7893835,-0.8850599,-0.06704834,-0.3677266,-0.6042272
50%,-0.05612872,-0.2241725,0.04154961,-0.3324155,0.2655749,-0.3602618,0.1300044
75%,0.6120513,0.5143847,0.4570161,0.4965512,0.5468038,-0.3192052,0.8030501
max,4.373522,5.781556,4.611682,6.851962,1.654315,7.279998,1.598468


In [10]:
# Summary statistics for the columns listed under 'longitudinal_feat_cols'.
longitudinal_feat_cols = model_config["longitudinal_feat_cols"]
train_df[longitudinal_feat_cols].describe()

Unnamed: 0,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,CHARLSON_INDEX,NUM_PROCEDURES,LOG_NUM_DRUGS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION
count,14069.0,14069.0,14069.0,14069.0,14069.0,7236.0
mean,-1.131293e-16,-3.636298e-17,-4.040331e-17,-7.272596e-17,1.777746e-16,-1.09979e-16
std,1.000036,1.000036,1.000036,1.000036,1.000036,1.000069
min,-2.576391,-1.865796,-2.035783,-1.161382,-3.003269,-2.309223
25%,-0.6457673,-0.7506508,-0.7893835,-0.8850599,-0.06704834,-0.749905
50%,-0.05612872,-0.2241725,0.04154961,-0.3324155,0.2655749,0.06372261
75%,0.6120513,0.5143847,0.4570161,0.4965512,0.5468038,0.819705
max,4.373522,5.781556,4.611682,6.851962,1.654315,1.933633


In [8]:
# Read the preprocessed test events and summarise the same
# 'current_feat_cols' columns for comparison with the training split.
test_df = pd.read_csv(test_df_path)

test_current_feats = test_df.loc[:, model_config['current_feat_cols']]
test_current_feats.describe()

Unnamed: 0,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,CHARLSON_INDEX,NUM_PROCEDURES,LOG_NUM_DRUGS,PARTICIPATION_DAYS,AGE
count,3537.0,3537.0,3537.0,3537.0,3537.0,3537.0,3537.0
mean,0.017071,0.00444,-0.004613,-0.001094,-0.003265,-0.042398,-0.005567
std,0.986179,0.976637,0.994128,0.98151,1.018092,0.887098,1.024637
min,-2.576391,-1.865796,-2.035783,-1.161382,-3.003269,-0.375191,-2.745736
25%,-0.631302,-0.734097,-0.789383,-0.88506,-0.067048,-0.367727,-0.604227
50%,-0.020103,-0.212858,0.04155,-0.332415,0.265575,-0.358396,0.130004
75%,0.649756,0.496684,0.457016,0.496551,0.565232,-0.319205,0.80305
max,3.990325,3.812308,5.027148,5.194029,1.548945,7.358378,1.598468


### Create PyTorch Datasets

In [9]:
# Serialized datasets live next to the model config.
dataset_dir = model_config["model_config_dir_path"]
pytorch_train_dataset_path = os.path.join(dataset_dir, "train_dataset.pt")
pytorch_test_dataset_path = os.path.join(dataset_dir, "test_dataset.pt")

# Rebuild when forced by OVERWRITE_PREPROCESSED or when either cached file is
# missing; otherwise reuse what is on disk.
cached_datasets_exist = os.path.exists(pytorch_train_dataset_path) and os.path.exists(pytorch_test_dataset_path)
if OVERWRITE_PREPROCESSED or not cached_datasets_exist:
    print("Creating PyTorch datasets...")

    # Imported lazily: only needed when the datasets have to be built.
    from recurrent_health_events_prediction.training.train_deep_learning_models import get_train_test_datasets

    train_dataset, test_dataset = get_train_test_datasets(
        train_df_path,
        test_df_path,
        model_config,
        training_data_config,
    )

    # Persist both datasets so later runs can skip the construction step.
    torch.save(train_dataset, pytorch_train_dataset_path)
    torch.save(test_dataset, pytorch_test_dataset_path)

    print(f"Train dataset saved to {pytorch_train_dataset_path}")
    print(f"Test dataset saved to {pytorch_test_dataset_path}")
else:
    print("Loading existing PyTorch datasets...")
    # weights_only=False runs the full unpickler, which can execute arbitrary
    # code: only load files that this notebook produced itself.
    train_dataset = torch.load(pytorch_train_dataset_path, weights_only=False)
    test_dataset = torch.load(pytorch_test_dataset_path, weights_only=False)
    print("Datasets loaded.")

Creating PyTorch datasets...
Train dataset saved to /workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/gru_duration_aware/train_dataset.pt
Test dataset saved to /workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/gru_duration_aware/test_dataset.pt


## Train the Model

In [11]:
from recurrent_health_events_prediction.training.train_deep_learning_models import train
from recurrent_health_events_prediction.model.RecurrentHealthEventsDL import GRUNet

# Seed torch's global random number generator before training so that
# torch-driven randomness inside train() (e.g. parameter initialisation and
# dropout) is repeatable when the notebook is re-run from a fresh kernel.
# NOTE(review): if train() sets its own seed internally this is redundant but
# harmless — confirm against its implementation.
SEED = 42
torch.manual_seed(SEED)

# Train a GRUNet on the training dataset using the settings in model_config.
# Returns the trained model and the training-loss values that the plotting
# cell below draws one point per epoch from.
model, loss_epochs = train(
    train_dataset=train_dataset,
    model_config=model_config,
    ModelClass=GRUNet,
)


Using provided model class: GRUNet

Model initialized and ready for training.
Model parameters:
  input_size_curr: 13
  hidden_size_head: 32
  input_size_seq: 7
  hidden_size_seq: 16
  num_layers_seq: 1
  dropout: 0.2

Starting training...
Number of training samples: 13415
Batch size: 64
Learning rate: 0.001
Optimizer: Adam
Loss function: CrossEntropyLoss

Epoch 1/50, Loss: 0.45157796477987655
Epoch 2/50, Loss: 0.35931548249153866
Epoch 3/50, Loss: 0.3495931325923829
Epoch 4/50, Loss: 0.3451975075261933
Epoch 5/50, Loss: 0.3427916115238553
Epoch 6/50, Loss: 0.3402510474125544
Epoch 7/50, Loss: 0.3387349426036789
Epoch 8/50, Loss: 0.3383906929975464
Epoch 9/50, Loss: 0.336984153517655
Epoch 10/50, Loss: 0.33522810900495165
Epoch 11/50, Loss: 0.33513779384749276
Epoch 12/50, Loss: 0.3350298990805944
Epoch 13/50, Loss: 0.3335432998481251
Epoch 14/50, Loss: 0.33287374944913956
Epoch 15/50, Loss: 0.33218870893830343
Epoch 16/50, Loss: 0.33038564239229473
Epoch 17/50, Loss: 0.33136111512070

In [13]:
import plotly.graph_objects as go

# Plot the training loss against the epoch number, save the figure as an HTML
# file in the model directory and display it inline.
batch_size = model_config['batch_size']
num_samples = len(train_dataset)

# Epochs are numbered from 1 on the x-axis.
epoch_numbers = list(range(1, len(loss_epochs) + 1))
loss_trace = go.Scatter(
    x=epoch_numbers,
    y=loss_epochs,
    mode='lines+markers',
    name='Training Loss',
)
plot_title = f"Training Loss per Epoch (Batch size: {batch_size}, Train samples: {num_samples})"

fig = go.Figure()
fig.add_trace(loss_trace)
fig.update_layout(
    title=plot_title,
    xaxis_title="Epoch",
    yaxis_title="Loss",
    template="plotly_white",
)

loss_plot_path = os.path.join(model_config["model_config_dir_path"], "training_loss.html")
fig.write_html(loss_plot_path)
fig.show()

In [14]:
import torch

# Persist only the model's state dict (not the whole module object) next to
# the model config.
model_dir = model_config["model_config_dir_path"]
model_save_path = os.path.join(model_dir, "gru_model.pt")

trained_state = model.state_dict()
torch.save(trained_state, model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to /workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/gru_duration_aware/gru_model.pt


## Load Model

In [15]:
from recurrent_health_events_prediction.model.RecurrentHealthEventsDL import GRUNet

# Rebuild a fresh GRUNet from the configured hyper-parameters and restore the
# weights saved earlier.
model_save_path = os.path.join(model_config["model_config_dir_path"], "gru_model.pt")
model = GRUNet(**model_params_dict)

# map_location="cpu" lets a checkpoint written from a GPU model load on a
# machine without CUDA; weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state dict needs.
state_dict = torch.load(model_save_path, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout); as the last expression of the
# cell this also displays the module summary.
model.eval()

GRUNet(
  (gru): GRU(7, 16, batch_first=True)
  (classifier_head): Sequential(
    (0): Linear(in_features=29, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=1, bias=True)
  )
)

## Evaluate the Model

In [16]:
from recurrent_health_events_prediction.training.train_deep_learning_models import evaluate

# Run the reloaded model over the test dataset; the value returned by
# evaluate() is kept in test_metrics.
eval_batch_size = 32
test_metrics = evaluate(model=model, test_dataset=test_dataset, batch_size=eval_batch_size)

Starting evaluation...
Number of test samples: 3360
Batch size: 32

Evaluation results - Accuracy: 0.8660714030265808, F1 Score: 0.13461539149284363, AUROC: 0.7140387892723083
