# Create Train and Test Dataframes

In [1]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import os
import yaml
import pandas as pd

# Load the data configuration that ships inside the `configs` package.
# The resource is opened through the Traversable returned by `files()` rather than
# by handing it to the built-in `open()`: a Traversable is only guaranteed to provide
# `.open()`, and it is not a filesystem path when the package is imported from a zip/wheel.
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

# ---- Run settings (edit these before executing the notebook) ----
DATASET = "mimic"
# True -> read the data from the multiple-hospitalization patients directory below.
USE_MULTIPLE_HOSP_PATIENTS = False
# True -> load the already encoded CSV; the remapping and one-hot encoding cells are skipped.
SKIP_ONEHOT_ENCODING = True

training_data_config = data_config['training_data'][DATASET]

# Source directory: a fixed path for the multiple-hospitalization subset,
# otherwise whatever the data config declares for this dataset.
data_directory = (
    "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients"
    if USE_MULTIPLE_HOSP_PATIENTS
    else training_data_config['data_directory']
)
full_data_path = os.path.join(data_directory, "all_events.csv")

# CSV that is read instead of `full_data_path` when SKIP_ONEHOT_ENCODING is True.
already_encoded_data_path = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_full.csv"

# Names of the two CSV files written at the end of the notebook.
output_train_df_filename = "train_tuning.csv"
output_test_df_filename = "validation_tuning.csv"

# Report which input file this run is going to read.
if SKIP_ONEHOT_ENCODING:
    print(f"One-hot encoded data path: {already_encoded_data_path}")
else:
    print(f"Full data path: {full_data_path}")


One-hot encoded data path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test/train_full.csv


## Load Preprocessed Data

In [2]:
# Choose the input CSV according to the run settings, then read it once.
if SKIP_ONEHOT_ENCODING:
    print("Skipping one-hot encoding as per configuration.")
    input_csv_path = already_encoded_data_path
else:
    input_csv_path = full_data_path

df = pd.read_csv(input_csv_path)
# ADMITTIME comes back from the CSV as text; convert it so later sorting is chronological.
df["ADMITTIME"] = pd.to_datetime(df["ADMITTIME"])

Skipping one-hot encoding as per configuration.


In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,6.988889,3,"['other', 'congestive_heart_failure', 'myocard...",False,False,True,...,False,False,False,True,False,False,False,False,True,False
1,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,5.364583,4,"['other', 'diabetes_without_cc', 'myocardial_i...",True,True,False,...,False,False,False,True,False,False,False,True,True,False
2,32,175413,2170-04-04 08:00:00,2170-04-23 12:45:00,19.197917,2,"['other', 'chronic_pulmonary_disease']",False,True,False,...,False,False,True,False,False,False,False,True,True,False
3,33,176176,2116-12-23 22:30:00,2116-12-27 12:05:00,3.565972,2,"['other', 'chronic_pulmonary_disease']",False,True,False,...,False,False,False,True,False,False,False,False,True,False
4,34,115799,2186-07-18 16:46:00,2186-07-20 16:00:00,1.968056,3,"['other', 'congestive_heart_failure', 'myocard...",False,False,True,...,False,False,False,True,False,False,False,True,True,False


In [6]:
# Sanity checks on the loaded events table.

# No row may be flagged as an in-hospital death.
assert df['IN_HOSP_DEATH_EVENT'].sum().item() == 0, "There are in-hospital death events in the dataset!"

# Every row must carry exactly one of the two flags. The check is done per row with XOR:
# comparing the summed flags against zero would already pass as soon as a single row
# had a flag set, and would miss rows with neither flag or with both.
is_historical_event = df['IS_HISTORICAL_EVENT'].astype(bool)
is_last_event = df['IS_LAST_EVENT'].astype(bool)
assert (is_historical_event ^ is_last_event).all(), "There are events that are neither historical nor last events, or both!"

## Remap Some Categorical Features

In [7]:
from recurrent_health_events_prediction.preprocessing.utils import remap_discharge_location, remap_mimic_races
# Remapping is only applied to the raw events table; the already encoded input skips it.
# NOTE(review): what the two helpers change exactly is defined in preprocessing.utils — confirm there.
if not SKIP_ONEHOT_ENCODING:
    df = remap_mimic_races(remap_discharge_location(df))
else:
    print("Skipping one-hot encoding as per configuration.")

Skipping one-hot encoding as per configuration.


## One-Hot Encode Features

In [8]:
from recurrent_health_events_prediction.training.utils import (
    preprocess_features_to_one_hot_encode,
)
# One-hot encode the configured categorical features, unless the input is already encoded.
# `encoded_preview` stays None when encoding is skipped, in which case the cell shows nothing.
encoded_preview = None

if SKIP_ONEHOT_ENCODING:
    print("Skipping one-hot encoding as per configuration.")
else:
    # Both lists come from the dataset's section of the data config.
    features_to_one_hot_encode = training_data_config["features_to_one_hot_encode"]
    one_hot_cols_to_drop = training_data_config["one_hot_cols_to_drop"]
    print("Features to one-hot encode:", features_to_one_hot_encode)
    print("One-hot columns to drop:", one_hot_cols_to_drop)

    df, new_cols = preprocess_features_to_one_hot_encode(
        df,
        features_to_encode=features_to_one_hot_encode,
        one_hot_cols_to_drop=one_hot_cols_to_drop,
    )
    encoded_preview = df[new_cols].head()

# An expression nested inside the `if`/`else` is never rendered by the notebook; only the
# last top-level expression of a cell is, so the preview is surfaced here.
encoded_preview

Skipping one-hot encoding as per configuration.


## Split Train and Test Sets

In [3]:
from recurrent_health_events_prediction.training.split import split_train_test_stratified_group

# Arguments for the patient-level split, gathered in one place so they are easy to tune.
# NOTE(review): the exact meaning of test_size / min_stratum_size is defined in training.split — confirm there.
split_kwargs = {
    "subject_id_col": "SUBJECT_ID",
    "label_col": "READMISSION_30_DAYS",
    "charlson_col": "CHARLSON_INDEX",
    "test_size": 0.2,
    "random_state": 42,
    "min_stratum_size": 2,
}
res = split_train_test_stratified_group(df, **split_kwargs)

# Show the report attached to the split result.
res.balance_report

Unnamed: 0,patients,num_visits_mean,num_visits_median,any_readmission_rate,mean_readmission_mean,mean_charlson_mean
train,8660.0,1.304388,1.0,0.120323,0.083998,4.894742
test,2166.0,1.28024,1.0,0.118652,0.083115,4.866846


In [4]:
# Show the first 10 rows of the table attached to the split result.
res.table.head(10)

Unnamed: 0,SUBJECT_ID,num_visits,any_readmission,mean_readmission,mean_CHARLSON_INDEX,b_num_visits,b_mean_readmission,b_mean_charlson,b_any_readm,strata
0,26,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
1,28,1,0,0.0,6.0,1,0,4-6,0,1|0|0|4-6
2,32,1,0,0.0,1.0,1,0,0-2,0,1|0|0|0-2
3,33,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
4,34,1,0,0.0,2.0,1,0,0-2,0,1|0|0|0-2
5,36,2,1,0.5,3.0,2,"(0.3,0.6]",2-4,1,"2|(0.3,0.6]|1|2-4"
6,37,1,0,0.0,6.0,1,0,4-6,0,1|0|0|4-6
7,38,1,0,0.0,4.0,1,0,2-4,0,1|0|0|2-4
8,42,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
9,55,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6


In [5]:
# Assign each row to a split according to which id list its SUBJECT_ID belongs to.
in_train_ids = df['SUBJECT_ID'].isin(res.train_ids)
in_test_ids = df['SUBJECT_ID'].isin(res.test_ids)

train_df = df.loc[in_train_ids]
test_df = df.loc[in_test_ids]

In [6]:
# Order both splits by subject, then by admission time within each subject.
sort_keys = ['SUBJECT_ID', 'ADMITTIME']
train_df, test_df = (
    split_frame.sort_values(by=sort_keys) for split_frame in (train_df, test_df)
)

In [7]:
# Directory that receives the train/test CSV files.
if SKIP_ONEHOT_ENCODING:
    # Write next to the already encoded input file.
    train_test_data_dir = os.path.dirname(already_encoded_data_path)
else:
    # Built with os.path.join instead of string concatenation, so both branches
    # produce a path without a trailing separator.
    train_test_data_dir = os.path.join(os.path.dirname(full_data_path), "train_test")

# Create the directory if it does not exist yet, then show where the files will go.
os.makedirs(train_test_data_dir, exist_ok=True)
print(train_test_data_dir)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/train_test


In [8]:
# Echo the configured output file names before anything is written.
for file_label, file_name in (
    ("Output train_df file: ", output_train_df_filename),
    ("Output test_df file: ", output_test_df_filename),
):
    print(file_label, file_name)

Output train_df file:  train_tuning.csv
Output test_df file:  validation_tuning.csv


In [9]:
# Report the number of rows in each split.
for size_label, split_frame in (
    ("Number of training samples: ", train_df),
    ("Number of test samples: ", test_df),
):
    print(size_label, split_frame.shape[0])

Number of training samples:  11296
Number of test samples:  2773


In [11]:
# Resolve both destination paths, then write each split without its index column.
train_csv_path = os.path.join(train_test_data_dir, output_train_df_filename)
test_csv_path = os.path.join(train_test_data_dir, output_test_df_filename)

train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

In [15]:
# Preview the first 10 rows of the sorted training split.
train_df.head(10)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,20.739583,3,"['other', 'chronic_pulmonary_disease', 'cerebr...",False,True,False,...,False,False,False,False,True,False,False,True,True,False
1,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,7.497222,3,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,True,False
2,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,15.790278,4,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,False,True
3,138,108120,2131-10-31 08:00:00,2131-11-06 12:54:00,6.204167,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,False,True,False,False,True,True,False
4,138,188284,2133-12-21 23:52:00,2134-01-11 13:00:00,20.547222,3,"['other', 'congestive_heart_failure', 'maligna...",False,False,True,...,False,False,False,False,True,False,False,True,True,False
5,222,145243,2137-07-15 15:31:00,2137-07-17 12:00:00,1.853472,3,"['other', 'myocardial_infarct', 'cerebrovascul...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
6,222,105083,2141-02-18 23:10:00,2141-02-21 15:50:00,2.694444,4,"['other', 'peripheral_vascular_disease', 'myoc...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
7,222,137006,2142-06-11 07:15:00,2142-06-19 12:30:00,8.21875,7,"['myocardial_infarct', 'other', 'peripheral_va...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
8,222,188038,2142-06-23 05:02:00,2142-07-01 17:30:00,8.519444,5,"['myocardial_infarct', 'other', 'malignant_can...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
9,236,151459,2134-10-04 01:19:00,2134-10-15 16:00:00,11.611806,3,"['mild_liver_disease', 'malignant_cancer', 'ot...",False,False,False,...,False,False,False,False,True,False,False,False,True,False


## Additional metrics comparison

### Admission Type

In [16]:
# Mean of ADMISSION_TYPE_ELECTIVE in each split, shown as a (train, test) pair.
# Presumably a 0/1 indicator, so the mean reads as the elective share — confirm.
train_df['ADMISSION_TYPE_ELECTIVE'].mean(), test_df['ADMISSION_TYPE_ELECTIVE'].mean()

(np.float64(0.10113137843454167), np.float64(0.11279826464208242))

### Readmission Duration Category

In [17]:
# Relative frequency of each READMISSION_TIME_CAT value in the training split.
train_df['READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.49619
0-30      0.25329
30-120    0.25052
Name: proportion, dtype: float64

In [18]:
# Relative frequency of each READMISSION_TIME_CAT value in the test split.
test_df['READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.527115
0-30      0.238612
30-120    0.234273
Name: proportion, dtype: float64