# Train-Test Split (Fixed IDs)

In [1]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml
import os
import pandas as pd

from recurrent_health_events_prediction.training.utils import make_train_test_split_file, apply_train_test_split_file

# Load the project's data configuration, which ships as a resource inside the `configs` package.
# `files()` returns a Traversable that is not guaranteed to be a real filesystem path
# (e.g. when the package is imported from a zip/wheel), so it is opened through its own
# `.open()` API instead of the builtin `open()`. The encoding is pinned so parsing does not
# depend on the platform's default locale.
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

In [8]:
# --- Where the data lives ---------------------------------------------------
# Key looked up under data_config['training_data'] to find the preprocessed data root.
DATASET = "mimic"
# Sub-directory of the preprocessed root that holds the input file and receives the split file.
DATASET_SUBDIR = "multiple_hosp_patients"
# File read as the source table, and file the ID -> split mapping is written to.
INPUT_FILENAME = "last_events.csv"
OUTPUT_FILENAME = "train_test_split.csv"

# --- Columns used by the split helpers ---------------------------------------
ID_COL = "HADM_ID"
TARGET_COL = "READMISSION_30_DAYS"

# --- Split parameters ---------------------------------------------------------
# Both values are passed to make_train_test_split_file as test_size / random_state.
TEST_SIZE = 0.2
RANDOM_STATE = 42

In [3]:
# Look up the preprocessed-data root configured for the selected dataset and echo it
# so the reader can confirm which location the notebook is working against.
dataset_config = data_config['training_data'][DATASET]
base_training_data_path = dataset_config['preprocessed_path']
print(base_training_data_path)

/workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure


In [4]:
# Resolve the dataset sub-directory that holds the input file and will receive the split file.
training_data_path = os.path.join(base_training_data_path, DATASET_SUBDIR)

# Fail fast: the following cells read from and write to this directory, so carrying on
# after a printed warning would only defer the failure to a less obvious place.
# `isdir` (rather than `exists`) also rejects a regular file sitting at that path.
if not os.path.isdir(training_data_path):
    raise FileNotFoundError(f"Directory does not exist: {training_data_path}")

print("Directory to import data from: ", training_data_path)

Directory to import data from:  /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/multiple_hosp_patients


In [5]:
# Full path of the input events file inside the dataset sub-directory.
filepath = os.path.join(training_data_path, INPUT_FILENAME)
print("Reading data from: ", filepath)

# Load the CSV into memory; `df` is the frame the train/test split is derived from.
df = pd.read_csv(filepath)

Reading data from:  /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/multiple_hosp_patients/last_events.csv


In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,...,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
0,107,182383,2121-11-30 19:24:00,2121-12-05 14:18:00,EMERGENCY,HISPANIC OR LATINO,HOME HEALTH CARE,Medicare,4.7875,4,...,0,0,1.7557,0.732641,0.693147,3.091042,1,0,120+,2
1,236,182562,2135-05-26 11:28:00,2135-05-28 16:46:00,EMERGENCY,PATIENT DECLINED TO ANSWER,HOME,Private,2.220833,2,...,0,0,1.16964,0.848175,0.693147,2.944439,1,0,120+,2
2,249,149546,2155-02-03 20:16:00,2155-02-14 11:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,10.624306,8,...,0,0,2.453098,1.918345,0.693147,3.713572,1,0,120+,2
3,256,188869,2170-06-15 03:06:00,2170-06-27 16:17:00,EMERGENCY,WHITE,SNF,Medicare,12.549306,4,...,0,0,2.606335,2.153851,1.098612,3.912023,1,0,30-120,1
4,291,125726,2106-04-17 12:24:00,2106-04-19 15:10:00,EMERGENCY,WHITE,HOME,Medicare,2.115278,4,...,0,0,1.136318,0.876442,0.693147,3.091042,1,0,120+,2


In [7]:
# The split file is stored next to the input data, in the same dataset sub-directory.
output_filepath = os.path.join(training_data_path, OUTPUT_FILENAME)
print("Saving data to: ", output_filepath)

Saving data to:  /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/multiple_hosp_patients/train_test_split.csv


In [9]:
# 1) Create and save a split file.
# The settings are gathered in one place; the helper receives exactly the same keyword
# arguments as before and writes its result to `output_filepath`.
# NOTE(review): whether the split is stratified on `target_col` depends on the helper — confirm there.
split_settings = {
    "id_col": ID_COL,
    "target_col": TARGET_COL,
    "test_size": TEST_SIZE,
    "random_state": RANDOM_STATE,
}
split_map = make_train_test_split_file(df=df, **split_settings, out_path=output_filepath)

In [10]:
# Display the ID -> split mapping returned by make_train_test_split_file.
split_map

Unnamed: 0,HADM_ID,split
0,151038,train
1,130458,train
2,171620,train
3,132919,train
4,141655,train
...,...,...
1364,182589,test
1365,119575,test
1366,196379,test
1367,164391,test


In [11]:
# Share of rows assigned to each split. Because of `normalize=True` the values are
# fractions in [0, 1] (not 0-100), despite the 'percentage' column label.
split_counts = (
    split_map['split']
    .value_counts(normalize=True)
    .reset_index()
    .set_axis(['split', 'percentage'], axis=1)
)
split_counts

Unnamed: 0,split,percentage
0,train,0.799854
1,test,0.200146


In [12]:
# Re-apply the split that was just written to disk to the same frame, reading the
# assignment back from the CSV rather than from the in-memory `split_map`.
split_result = apply_train_test_split_file(
    df=df, split_csv_path=output_filepath, id_col=ID_COL, target_col=TARGET_COL
)
X_train, X_test, y_train, y_test, train_ids, test_ids = split_result

In [13]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,...,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
0,107,2121-11-30 19:24:00,2121-12-05 14:18:00,EMERGENCY,HISPANIC OR LATINO,HOME HEALTH CARE,Medicare,4.7875,4,"['congestive_heart_failure', 'myocardial_infar...",...,0,0,1.7557,0.732641,0.693147,3.091042,1,0,120+,2
2,249,2155-02-03 20:16:00,2155-02-14 11:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,10.624306,8,"['myocardial_infarct', 'paraplegia', 'chronic_...",...,0,0,2.453098,1.918345,0.693147,3.713572,1,0,120+,2
3,256,2170-06-15 03:06:00,2170-06-27 16:17:00,EMERGENCY,WHITE,SNF,Medicare,12.549306,4,"['peripheral_vascular_disease', 'myocardial_in...",...,0,0,2.606335,2.153851,1.098612,3.912023,1,0,30-120,1
5,305,2127-06-19 23:49:00,2127-07-16 13:58:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,26.589583,3,"['diabetes_with_cc', 'congestive_heart_failure...",...,0,0,3.317438,2.028721,1.098612,4.189655,1,0,120+,2
6,323,2119-09-21 20:34:00,2119-10-06 18:50:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,14.927778,4,"['peripheral_vascular_disease', 'diabetes_with...",...,0,0,2.768065,2.564941,1.098612,3.912023,1,0,30-120,1


In [14]:
# Preview the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,...,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
1,236,2135-05-26 11:28:00,2135-05-28 16:46:00,EMERGENCY,PATIENT DECLINED TO ANSWER,HOME,Private,2.220833,2,"['other', 'mild_liver_disease']",...,0,0,1.16964,0.848175,0.693147,2.944439,1,0,120+,2
4,291,2106-04-17 12:24:00,2106-04-19 15:10:00,EMERGENCY,WHITE,HOME,Medicare,2.115278,4,"['rheumatic_disease', 'myocardial_infarct', 'o...",...,0,0,1.136318,0.876442,0.693147,3.091042,1,0,120+,2
13,580,2138-02-13 23:43:00,2138-02-17 02:30:00,EMERGENCY,BLACK/AFRICAN AMERICAN,SNF,Medicare,3.115972,2,"['other', 'chronic_pulmonary_disease']",...,0,0,1.414875,1.528524,1.098612,3.218876,1,0,120+,2
14,618,2117-12-10 14:46:00,2117-12-15 16:12:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,5.059722,3,"['congestive_heart_failure', 'myocardial_infar...",...,0,0,1.801664,1.424296,0.693147,3.367296,1,0,120+,2
17,773,2109-01-23 21:51:00,2109-01-28 17:00:00,EMERGENCY,WHITE,HOME,Private,4.797917,6,"['peptic_ulcer_disease', 'other', 'mild_liver_...",...,0,0,1.757499,0.956948,1.098612,2.944439,1,0,0-30,0


In [15]:
# Peek at the first five IDs returned for each split.
train_ids[:5], test_ids[:5]

(array([182383, 149546, 188869, 122211, 128132]),
 array([182562, 125726, 195290, 155036, 160425]))

### Sanity Check

In [16]:
# Cross-check the IDs returned by apply_train_test_split_file (read back from the CSV)
# against the in-memory mapping produced by make_train_test_split_file.
train_mask = (split_map['split'] == 'train')
test_mask = (split_map['split'] == 'test')

# Every row of the mapping must carry one of the two expected labels; a missing or
# unexpected label would otherwise be silently ignored by the set comparisons below.
assert (train_mask | test_mask).all(), "split_map contains labels other than 'train'/'test'"

train_ids_split_map = split_map.loc[train_mask, ID_COL].values
test_ids_split_map = split_map.loc[test_mask, ID_COL].values

train_id_set = set(train_ids)
test_id_set = set(test_ids)

assert train_id_set == set(train_ids_split_map), "train IDs differ from the saved split map"
assert test_id_set == set(test_ids_split_map), "test IDs differ from the saved split map"

# The two equality checks above still pass if the same ID sits in both splits,
# so leakage between train and test is ruled out explicitly.
assert train_id_set.isdisjoint(test_id_set), "some IDs appear in both train and test"