# Split the data into training and test sets

In [None]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import os
import yaml
import pandas as pd

# Load the packaged YAML configuration shipped inside the `configs` package.
# The resource is opened through the Traversable API (rather than the builtin
# `open`) so it also works when the package is not unpacked on a real
# filesystem, and the encoding is pinned so parsing does not depend on the
# machine's locale.
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

# Location of the full, preprocessed event table that gets split below.
# NOTE(review): this is an absolute, environment-specific path; consider
# deriving it from `data_config` if the config exposes it — TODO confirm.
full_data_path = "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/mimic_cleaned_v2/all_events.csv"

In [2]:
# Read the complete event table and convert the admission timestamp column
# from text to datetime in a single chained expression.
df = pd.read_csv(full_data_path).assign(
    ADMITTIME=lambda events: pd.to_datetime(events["ADMITTIME"])
)
# Preview the first rows.
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,...,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
0,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,16.364583,2,...,0,0,2.854433,1.541783,0.0,3.637586,1,0,120+,2
1,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,13.165278,6,...,0,0,2.650794,1.93177,0.0,3.970292,1,0,120+,2
2,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,EMERGENCY,UNKNOWN/NOT SPECIFIED,HOME,Medicare,6.988889,3,...,0,1,2.078052,1.14444,0.0,3.433987,1,0,120+,2
3,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,5.364583,4,...,0,0,1.850749,0.752551,0.0,3.806662,1,0,120+,2
4,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,URGENT,UNKNOWN/NOT SPECIFIED,HOME HEALTH CARE,Medicare,5.013889,2,...,0,0,1.794072,1.046344,0.0,0.0,1,0,120+,2


In [3]:
from recurrent_health_events_prediction.training.split import split_train_test_stratified_group

# Settings for the subject-level split, kept together so they are easy to tune.
# NOTE(review): the meaning of each option is defined by the project helper;
# the names suggest a grouped, stratified split — confirm against its docs.
split_options = {
    "subject_id_col": "SUBJECT_ID",
    "label_col": "READMISSION_30_DAYS",
    "charlson_col": "CHARLSON_INDEX",
    "test_size": 0.2,
    "random_state": 42,  # fixed seed for a reproducible split
    "min_stratum_size": 2,
}
res = split_train_test_stratified_group(df, **split_options)

# Show the train-vs-test balance summary returned by the helper.
res.balance_report

Unnamed: 0,patients,num_visits_mean,num_visits_median,any_readmission_rate,mean_readmission_mean,mean_charlson_mean
train,10826.0,1.299557,1.0,0.119989,0.083821,4.889161
test,2707.0,1.306612,1.0,0.120798,0.084985,4.890816


In [4]:
# Display the per-subject table returned by the split helper: one row per
# SUBJECT_ID with visit/readmission/Charlson aggregates, their binned `b_*`
# counterparts and a `strata` label (which appears to be the `b_*` values
# joined with "|" — confirm in the helper's implementation).
res.table

Unnamed: 0,SUBJECT_ID,num_visits,any_readmission,mean_readmission,mean_CHARLSON_INDEX,b_num_visits,b_mean_readmission,b_mean_charlson,b_any_readm,strata
0,6,1,0,0.0,4.0,1,0,2-4,0,1|0|0|2-4
1,21,1,0,0.0,10.0,1,0,8-10,0,1|0|0|8-10
2,26,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
3,28,1,0,0.0,6.0,1,0,4-6,0,1|0|0|4-6
4,30,1,0,0.0,1.0,1,0,0-2,0,1|0|0|0-2
...,...,...,...,...,...,...,...,...,...,...
13528,99938,1,0,0.0,9.0,1,0,8-10,0,1|0|0|8-10
13529,99939,1,0,0.0,2.0,1,0,0-2,0,1|0|0|0-2
13530,99965,1,0,0.0,5.0,1,0,4-6,0,1|0|0|4-6
13531,99982,2,1,0.5,3.5,2,"(0.3,0.6]",2-4,1,"2|(0.3,0.6]|1|2-4"


In [5]:
# Guard against patient leakage: a subject must never be assigned to both
# partitions, otherwise visits of the same patient would end up on both sides
# of the split that is persisted later in this notebook.
overlapping_subjects = set(res.train_ids) & set(res.test_ids)
assert not overlapping_subjects, (
    f"{len(overlapping_subjects)} subjects appear in both train and test ids"
)

# Materialise the row-level partitions: every event row follows the
# assignment of its SUBJECT_ID.
train_df = df[df['SUBJECT_ID'].isin(res.train_ids)]
test_df = df[df['SUBJECT_ID'].isin(res.test_ids)]

In [6]:
# Order each partition by subject and then by admission time, so the visits of
# a subject are contiguous and chronological.
visit_order = ['SUBJECT_ID', 'ADMITTIME']
train_df = train_df.sort_values(by=visit_order)
test_df = test_df.sort_values(by=visit_order)

In [8]:
# Directory that holds the full event CSV (the head component of its path);
# echoed so the location can be checked visually.
train_test_data_dir = os.path.split(full_data_path)[0]
print(train_test_data_dir)

/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/mimic_cleaned_v2


In [9]:
# Persist both partitions as CSV files inside `train_test_data_dir`, without
# writing the DataFrame index as a column.
for file_suffix, partition in (
    ("/train_events.csv", train_df),
    ("/test_events.csv", test_df),
):
    partition.to_csv(train_test_data_dir + file_suffix, index=False)