# Create Train and Test Dataframes

In [None]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import os
import yaml
import pandas as pd

# Load the packaged data configuration through the importlib.resources
# Traversable API instead of the builtin open(): the builtin only works when the
# package lives on the real filesystem, and it decodes with the locale default.
config_resource = impresources.files(configs) / 'data_config.yaml'
data_config = yaml.safe_load(config_resource.read_text(encoding="utf-8"))

# Dataset key used to select the matching section of the training-data config.
DATASET = "mimic"
training_data_config = data_config['training_data'][DATASET]
data_directory = training_data_config['preprocessed_output_path_1st_round']

# CSV with all events, located inside the configured 1st-round output directory.
full_data_path = os.path.join(data_directory, "all_events.csv")
print(f"Full data path: {full_data_path}")

Full data path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/all_events.csv


## Load Preprocessed Data (1st round)

In [4]:
# Read the events CSV and convert ADMITTIME from its text form to datetimes in
# a single chained expression, then preview the first rows.
df = pd.read_csv(full_data_path).assign(
    ADMITTIME=lambda events: pd.to_datetime(events["ADMITTIME"])
)
df.head(5)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,...,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_PARTICIPATION_DAYS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
0,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,16.364583,2,...,0,2.854433,1.541783,0.0,2.833213,3.637586,1,0,120+,2
1,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,13.165278,6,...,0,2.650794,1.93177,0.0,2.639057,3.970292,1,0,120+,2
2,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,EMERGENCY,UNKNOWN/NOT SPECIFIED,HOME,Medicare,6.988889,3,...,1,2.078052,1.14444,0.0,1.94591,3.433987,1,0,120+,2
3,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,5.364583,4,...,0,1.850749,0.752551,0.0,1.791759,3.806662,1,0,120+,2
4,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,URGENT,UNKNOWN/NOT SPECIFIED,HOME HEALTH CARE,Medicare,5.013889,2,...,0,1.794072,1.046344,0.0,1.791759,0.0,1,0,120+,2


In [5]:
# The dataset is expected to contain no in-hospital death events.
assert df['IN_HOSP_DEATH_EVENT'].sum().item() == 0, "There are in-hospital death events in the dataset!"

# Every event must carry exactly one of the two flags. The check is row-wise:
# a total of 0 means "neither" and 2 means "both". A global `sum() != 0` would
# pass as soon as a single row had a flag set and never detect either case.
event_flag_total = df['IS_HISTORICAL_EVENT'] + df['IS_LAST_EVENT']
assert (event_flag_total == 1).all(), "There are events that are neither historical nor last events, or both!"

## Remap Some Categorical Features

In [6]:
from recurrent_health_events_prediction.preprocessing.utils import remap_discharge_location, remap_mimic_races

# Apply the project's remapping helpers one after another; each takes the
# frame and returns the remapped frame (discharge location first, then race).
for remap_step in (remap_discharge_location, remap_mimic_races):
    df = remap_step(df)

## One-Hot Encode Features

In [7]:
from recurrent_health_events_prediction.training.utils import (
    preprocess_features_to_one_hot_encode,
)

# Both the features to encode and the dummy columns to drop come from the
# dataset's training-data configuration.
features_to_one_hot_encode, one_hot_cols_to_drop = (
    training_data_config[config_key]
    for config_key in ("features_to_one_hot_encode", "one_hot_cols_to_drop")
)

# The helper returns the encoded frame together with the names of the
# columns it created.
encoding_result = preprocess_features_to_one_hot_encode(
    df,
    features_to_encode=features_to_one_hot_encode,
    one_hot_cols_to_drop=one_hot_cols_to_drop,
)
df, new_cols = encoding_result

# Preview only the newly created one-hot columns.
df[new_cols].head(5)

Unnamed: 0,ADMISSION_TYPE_ELECTIVE,ADMISSION_TYPE_URGENT,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,GENDER_M,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE
0,True,False,True,False,False,False,True,False,False,False,True,False
1,False,False,False,True,False,False,True,True,False,False,True,False
2,False,False,True,False,False,False,False,True,False,False,True,False
3,True,False,True,False,False,False,True,True,False,False,True,False
4,False,True,True,False,False,False,False,True,False,False,True,False


## Split Train and Test Sets

In [8]:
from recurrent_health_events_prediction.training.split import split_train_test_stratified_group

# Split parameters collected in one place: grouping column, the label and
# Charlson columns handed to the splitter, an 80/20 split and a fixed seed.
split_options = {
    "subject_id_col": "SUBJECT_ID",
    "label_col": "READMISSION_30_DAYS",
    "charlson_col": "CHARLSON_INDEX",
    "test_size": 0.2,
    "random_state": 42,
    "min_stratum_size": 2,
}

res = split_train_test_stratified_group(df, **split_options)

# Side-by-side summary of the two splits produced by the helper.
res.balance_report

Unnamed: 0,patients,num_visits_mean,num_visits_median,any_readmission_rate,mean_readmission_mean,mean_charlson_mean
train,10826.0,1.299557,1.0,0.119989,0.083821,4.889161
test,2707.0,1.306612,1.0,0.120798,0.084985,4.890816


In [9]:
# Preview the first 10 rows of the table returned with the split result.
res.table.head(10)

Unnamed: 0,SUBJECT_ID,num_visits,any_readmission,mean_readmission,mean_CHARLSON_INDEX,b_num_visits,b_mean_readmission,b_mean_charlson,b_any_readm,strata
0,6,1,0,0.0,4.0,1,0,2-4,0,1|0|0|2-4
1,21,1,0,0.0,10.0,1,0,8-10,0,1|0|0|8-10
2,26,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
3,28,1,0,0.0,6.0,1,0,4-6,0,1|0|0|4-6
4,30,1,0,0.0,1.0,1,0,0-2,0,1|0|0|0-2
5,32,1,0,0.0,1.0,1,0,0-2,0,1|0|0|0-2
6,33,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
7,34,1,0,0.0,2.0,1,0,0-2,0,1|0|0|0-2
8,36,2,1,0.5,3.0,2,"(0.3,0.6]",2-4,1,"2|(0.3,0.6]|1|2-4"
9,37,1,0,0.0,6.0,1,0,4-6,0,1|0|0|4-6


In [10]:
# Row masks by subject membership: every row whose SUBJECT_ID is listed in
# res.train_ids goes to the training frame, and likewise for res.test_ids.
in_train_split = df['SUBJECT_ID'].isin(res.train_ids)
in_test_split = df['SUBJECT_ID'].isin(res.test_ids)

train_df = df.loc[in_train_split]
test_df = df.loc[in_test_split]

In [11]:
# Order both splits by subject and then by admission time.
train_df, test_df = (
    split_frame.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
    for split_frame in (train_df, test_df)
)

In [12]:
# The split files are written next to the source CSV: take the directory
# part (head) of its path.
train_test_data_dir, _ = os.path.split(full_data_path)
print(train_test_data_dir)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes


In [13]:
# Persist both splits next to the source CSV, without the DataFrame index.
# os.path.join replaces manual "/" concatenation: with an empty directory the
# concatenated form would resolve to the filesystem root ("/train_events.csv"),
# and join matches how the input path is built earlier in this notebook.
train_df.to_csv(os.path.join(train_test_data_dir, "train_events.csv"), index=False)
test_df.to_csv(os.path.join(train_test_data_dir, "test_events.csv"), index=False)

## Additional Metrics Comparison (Train vs. Test)

### Admission Type

In [15]:
# Mean of the ADMISSION_TYPE_ELECTIVE indicator in each split, as a (train, test) pair.
tuple(
    split_frame['ADMISSION_TYPE_ELECTIVE'].mean()
    for split_frame in (train_df, test_df)
)

(np.float64(0.15686971355462365), np.float64(0.16115351993214588))

### Readmission Duration Category

In [16]:
# Proportion of training rows per READMISSION_TIME_CAT value
# (normalize=True returns fractions instead of raw counts).
train_df['READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.757907
0-30      0.123250
30-120    0.118843
Name: proportion, dtype: float64

In [17]:
# Proportion of test rows per READMISSION_TIME_CAT value
# (normalize=True returns fractions instead of raw counts).
test_df['READMISSION_TIME_CAT'].value_counts(normalize=True)

READMISSION_TIME_CAT
120+      0.752050
0-30      0.124682
30-120    0.123268
Name: proportion, dtype: float64