# Create Train and Test Dataframes

In [2]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import os
import yaml
import pandas as pd

# Load the project's data configuration, which ships inside the `configs` package.
# The resource is opened through the Traversable returned by `files()` instead of
# the builtin `open()`: the builtin needs a real filesystem path and therefore
# fails when the package is not unpacked on disk (e.g. imported from a zip).
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r') as f:
    data_config = yaml.safe_load(f)

# Select the training-data settings for the dataset used in this notebook.
DATASET = "mimic"
training_data_config = data_config['training_data'][DATASET]

# NOTE(review): the directory below is an absolute, environment-specific path.
# The config-driven alternative is kept commented out; confirm which one is
# intended before running this notebook on another machine.
# data_directory = training_data_config['preprocessed_output_path_1st_round']
data_directory = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients"

# Full path of the events table that the rest of the notebook reads.
full_data_path = os.path.join(data_directory, "all_events.csv")
print(f"Full data path: {full_data_path}")

Full data path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/all_events.csv


## Load Preprocessed Data (1st round)

In [3]:
# Read the events table and convert ADMITTIME from text to datetimes in one
# chained expression, then preview the first rows.
df = pd.read_csv(full_data_path).assign(
    ADMITTIME=lambda events: pd.to_datetime(events["ADMITTIME"])
)
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,...,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_PARTICIPATION_DAYS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
0,36,182104,2131-04-30 07:15:00,2131-05-08 14:00:00,EMERGENCY,WHITE,HOME HEALTH CARE,Medicare,8.28125,2,...,0,2.227996,0.746496,0.0,2.197225,3.951244,0,1,0-30,0
1,36,122659,2131-05-12 19:49:00,2131-05-25 13:30:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,12.736806,2,...,0,2.620079,2.061729,0.693147,3.258097,4.290459,1,0,120+,2
2,107,191941,2115-02-20 17:41:00,2115-02-21 16:30:00,EMERGENCY,HISPANIC OR LATINO,HOME,Private,0.950694,2,...,0,0.668185,0.702404,0.0,0.693147,2.397895,0,1,120+,2
3,107,182383,2121-11-30 19:24:00,2121-12-05 14:18:00,EMERGENCY,HISPANIC OR LATINO,HOME HEALTH CARE,Medicare,4.7875,4,...,0,1.7557,0.732641,0.693147,7.816014,3.091042,1,0,120+,2
4,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,EMERGENCY,WHITE,HOME HEALTH CARE,Private,20.739583,3,...,0,3.079135,1.591651,0.0,3.044522,3.663562,0,1,120+,2


In [4]:
# Sanity checks on the loaded events before any further processing.

# The dataset is expected to contain no in-hospital death events.
assert df['IN_HOSP_DEATH_EVENT'].sum().item() == 0, "There are in-hospital death events in the dataset!"

# Every event must carry exactly one of the two flags. The check is done per
# row: summing the flags over the whole frame and comparing with zero would
# pass as long as a single row had a flag set, and so would miss rows that
# have neither flag or both.
flags_per_event = df['IS_HISTORICAL_EVENT'].astype(int) + df['IS_LAST_EVENT'].astype(int)
assert (flags_per_event == 1).all(), "There are events that are neither historical nor last events, or both!"

## Remap Some Categorical Features

In [5]:
from recurrent_health_events_prediction.preprocessing.utils import (
    remap_discharge_location,
    remap_mimic_races,
)

# Apply the discharge-location remap first, then the race/ethnicity remap.
# NOTE(review): both helpers live in the project's preprocessing utilities;
# presumably they collapse raw categories into coarser groups — confirm there.
df = remap_mimic_races(remap_discharge_location(df))

## One-Hot Encode Features

In [6]:
from recurrent_health_events_prediction.training.utils import preprocess_features_to_one_hot_encode

# Which features to encode, and which resulting dummy columns to drop, both
# come from the dataset's training configuration.
one_hot_cols_to_drop = training_data_config["one_hot_cols_to_drop"]
features_to_one_hot_encode = training_data_config["features_to_one_hot_encode"]

one_hot_options = {
    "features_to_encode": features_to_one_hot_encode,
    "one_hot_cols_to_drop": one_hot_cols_to_drop,
}

# The helper returns the updated frame together with the names of the columns
# it created; preview only those new columns.
df, new_cols = preprocess_features_to_one_hot_encode(df, **one_hot_options)
df[new_cols].head()

Unnamed: 0,ADMISSION_TYPE_ELECTIVE,ADMISSION_TYPE_URGENT,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,GENDER_M,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE
0,False,False,True,False,False,False,True,True,False,False,True,False
1,False,False,False,True,False,False,True,True,False,False,True,False
2,False,False,True,False,False,True,False,True,False,False,False,True
3,False,False,True,False,False,True,False,True,False,False,True,False
4,False,False,True,False,False,False,True,True,False,False,False,True


## Split Train and Test Sets

In [7]:
from recurrent_health_events_prediction.training.split import split_train_test_stratified_group

# Settings for the stratified group split: 20% of the data is held out for
# testing, with a fixed seed so the split is reproducible.
# NOTE(review): presumably the split is made at patient level (SUBJECT_ID) so
# that one patient's visits never land on both sides — confirm in the split module.
split_options = dict(
    subject_id_col="SUBJECT_ID",
    label_col="READMISSION_30_DAYS",
    charlson_col="CHARLSON_INDEX",
    test_size=0.2,
    random_state=42,
    min_stratum_size=2,
)
res = split_train_test_stratified_group(df, **split_options)

# Show the train-vs-test balance summary produced by the split.
res.balance_report

Unnamed: 0,patients,num_visits_mean,num_visits_median,any_readmission_rate,mean_readmission_mean,mean_charlson_mean
train,1502.0,3.190413,2.0,0.501332,0.241257,5.036433
test,376.0,3.082447,2.0,0.507979,0.247707,5.095147


In [8]:
# Preview the first 10 rows of the table attached to the split result.
res.table.head(10)

Unnamed: 0,SUBJECT_ID,num_visits,any_readmission,mean_readmission,mean_CHARLSON_INDEX,b_num_visits,b_mean_readmission,b_mean_charlson,b_any_readm,strata
0,36,2,1,0.5,3.0,2,"(0.3,0.6]",2-4,1,"2|(0.3,0.6]|1|2-4"
1,107,2,0,0.0,5.0,2,0,4-6,0,2|0|0|4-6
2,124,3,0,0.0,5.333333,3,0,4-6,0,3|0|0|4-6
3,138,2,0,0.0,3.0,2,0,2-4,0,2|0|0|2-4
4,188,6,0,0.0,4.666667,6-10,0,4-6,0,6-10|0|0|4-6
5,222,4,1,0.25,7.5,4-5,"(0.1,0.3]",6-8,1,"4-5|(0.1,0.3]|1|6-8"
6,236,2,0,0.0,3.0,2,0,2-4,0,2|0|0|2-4
7,249,2,0,0.0,9.5,2,0,8-10,0,2|0|0|8-10
8,256,3,0,0.0,5.666667,3,0,4-6,0,3|0|0|4-6
9,291,2,0,0.0,5.0,2,0,4-6,0,2|0|0|4-6


In [9]:
# Build one boolean mask per side of the split: an event belongs to a side when
# its SUBJECT_ID is among the ids the split assigned to that side.
in_train = df['SUBJECT_ID'].isin(res.train_ids)
in_test = df['SUBJECT_ID'].isin(res.test_ids)

train_df, test_df = df[in_train], df[in_test]

In [10]:
# Order both sets by patient and then by admission time.
sort_keys = ['SUBJECT_ID', 'ADMITTIME']
train_df = train_df.sort_values(by=sort_keys)
test_df = test_df.sort_values(by=sort_keys)

In [11]:
# The train/test files are written to the same directory as the input events
# file, so take the directory part of its path.
train_test_data_dir = os.path.split(full_data_path)[0]
print(train_test_data_dir)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients


In [12]:
# Write the two splits next to the input file, without the DataFrame index.
# Paths are built with os.path.join (as for the input path) rather than by
# concatenating "/" by hand: with an empty directory string, concatenation
# would silently point at the filesystem root.
train_events_path = os.path.join(train_test_data_dir, "train_events.csv")
test_events_path = os.path.join(train_test_data_dir, "test_events.csv")

train_df.to_csv(train_events_path, index=False)
test_df.to_csv(test_events_path, index=False)

In [14]:
# Preview the first 10 rows of the sorted training set.
train_df.head(n=10)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
4,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,20.739583,3,"['other', 'chronic_pulmonary_disease', 'cerebr...",False,True,False,...,False,False,False,False,True,False,False,True,True,False
5,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,7.497222,3,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,True,False
6,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,15.790278,4,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,False,True
7,138,108120,2131-10-31 08:00:00,2131-11-06 12:54:00,6.204167,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,False,True,False,False,True,True,False
8,138,188284,2133-12-21 23:52:00,2134-01-11 13:00:00,20.547222,3,"['other', 'congestive_heart_failure', 'maligna...",False,False,True,...,False,False,False,False,True,False,False,True,True,False
15,222,145243,2137-07-15 15:31:00,2137-07-17 12:00:00,1.853472,3,"['other', 'myocardial_infarct', 'cerebrovascul...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
16,222,105083,2141-02-18 23:10:00,2141-02-21 15:50:00,2.694444,4,"['other', 'peripheral_vascular_disease', 'myoc...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
17,222,137006,2142-06-11 07:15:00,2142-06-19 12:30:00,8.21875,7,"['myocardial_infarct', 'other', 'peripheral_va...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
18,222,188038,2142-06-23 05:02:00,2142-07-01 17:30:00,8.519444,5,"['myocardial_infarct', 'other', 'malignant_can...",False,False,False,...,False,False,False,True,False,False,False,True,True,False
19,236,151459,2134-10-04 01:19:00,2134-10-15 16:00:00,11.611806,3,"['mild_liver_disease', 'malignant_cancer', 'ot...",False,False,False,...,False,False,False,False,True,False,False,False,True,False


## Additional metrics comparison

### Admission Type

In [15]:
# Mean of the elective-admission indicator in each split, shown as a
# (train, test) pair.
tuple(split['ADMISSION_TYPE_ELECTIVE'].mean() for split in (train_df, test_df))

(np.float64(0.10225375626043405), np.float64(0.0811044003451251))

### Readmission Duration Category

In [16]:
# Proportion of training events in each readmission-time category.
train_df.loc[:, 'READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.499165
0-30      0.251878
30-120    0.248957
Name: proportion, dtype: float64

In [17]:
# Proportion of test events in each readmission-time category.
test_df.loc[:, 'READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.497843
30-120    0.255393
0-30      0.246764
Name: proportion, dtype: float64