# Generate Inference Data for Frontend Demo

In [1]:
import os
from importlib import resources as impresources

import numpy as np
import pandas as pd
import yaml

from recurrent_health_events_prediction import configs

# Load the packaged data_config.yaml into `config`.
# Traversable.open() is used instead of the builtin open() so the resource can
# be read even when the package is not a plain directory on disk (the builtin
# needs an os.PathLike). The encoding is pinned because the platform default
# on Windows is not UTF-8.
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [37]:
# Folder that receives the demo CSV files written by the to_csv calls at the end of this notebook.
OUTPUT_DIR = 'C:/Users/User/projects/msc-thesis-recurrent-health-modeling/data/mimic-api-demo'

In [2]:
# Key column names, read from the MIMIC training-data section of the config.
preprocessing_config = config['training_data']['mimic']
patient_id_col, hosp_id_col, time_col = (
    preprocessing_config[key]
    for key in ('patient_id_col', 'hosp_id_col', 'time_col')
)

# Table -> list of columns mapping from the MIMIC inference section.
inference_config = config['inference']['mimic']
required_cols = inference_config['required_cols']

# Set the dataset location in place on the loaded config; later lookups of
# config['dataset']['mimic'] return this same dict and therefore see the path.
dataset_config = config['dataset']['mimic']
dataset_config['path'] = 'C:\\Users\\User\\projects\\msc-thesis-recurrent-health-modeling\\data\\mimic-iii-dataset'

In [3]:
# Display the table -> required-columns mapping read from the inference config.
required_cols

{'admissions_diagnoses': ['HADM_ID',
  'SUBJECT_ID',
  'ADMITTIME',
  'DISCHTIME',
  'ADMISSION_TYPE',
  'INSURANCE',
  'ETHNICITY',
  'DISCHARGE_LOCATION',
  'ICD9_CODE'],
 'icu_stays': ['SUBJECT_ID', 'HADM_ID', 'INTIME', 'OUTTIME'],
 'prescriptions': ['HADM_ID', 'SUBJECT_ID', 'DRUG'],
 'procedures': ['HADM_ID', 'SUBJECT_ID', 'ICD9_CODE'],
 'patients': ['SUBJECT_ID', 'GENDER', 'DOB']}

In [22]:
# Human-readable listing of the columns each table must provide.
print("Required Columns for API\n")
for table_name, table_cols in required_cols.items():
    print(f"Table: {table_name}")
    for col_name in table_cols:
        print(f"  - {col_name}")

Required Columns for API

Table: admissions_diagnoses
  - HADM_ID
  - SUBJECT_ID
  - ADMITTIME
  - DISCHTIME
  - ADMISSION_TYPE
  - INSURANCE
  - ETHNICITY
  - DISCHARGE_LOCATION
  - ICD9_CODE
Table: icu_stays
  - SUBJECT_ID
  - HADM_ID
  - INTIME
  - OUTTIME
Table: prescriptions
  - HADM_ID
  - SUBJECT_ID
  - DRUG
Table: procedures
  - HADM_ID
  - SUBJECT_ID
  - ICD9_CODE
Table: patients
  - SUBJECT_ID
  - GENDER
  - DOB


In [23]:
# Both splits are read from the train_test/ folder under base_dir.
base_dir = "C:/Users/User/projects/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients"

train_data_path = f"{base_dir}/train_test/train_final.csv"
test_data_path = f"{base_dir}/train_test/test.csv"

# Train is read first, then test, matching the order of the paths below.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)


In [24]:
# Preview the first rows of the frame loaded from train_final.csv.
train_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,20.739583,3,"['other', 'chronic_pulmonary_disease', 'cerebr...",False,True,False,...,False,False,False,False,True,False,False,True,True,False
1,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,7.497222,3,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,True,False
2,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,15.790278,4,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,False,True
3,138,108120,2131-10-31 08:00:00,2131-11-06 12:54:00,6.204167,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,False,True,False,False,True,True,False
4,138,188284,2133-12-21 23:52:00,2134-01-11 13:00:00,20.547222,3,"['other', 'congestive_heart_failure', 'maligna...",False,False,True,...,False,False,False,False,True,False,False,True,True,False


In [27]:
# Distinct patient identifiers per split, in order of first appearance.
subject_ids_train = pd.unique(train_df[patient_id_col])
subject_ids_test = pd.unique(test_df[patient_id_col])

n_train_patients = len(subject_ids_train)
n_test_patients = len(subject_ids_test)
print(f"Number of unique patients in train set: {n_train_patients}")
print(f"Number of unique patients in test set: {n_test_patients}")

Number of unique patients in train set: 1351
Number of unique patients in test set: 376


In [28]:
# Size of the demo cohort and the share of it drawn from the training split.
final_num_subjects = 1000
percentage_train = 0.7

# The train share is truncated to an integer; the test split takes the remainder
# so the two counts always add up to final_num_subjects.
num_train_subjects = int(final_num_subjects * percentage_train)
num_test_subjects = final_num_subjects - num_train_subjects

print(
    f"Number of subjects for inference demo: {final_num_subjects}",
    f"  - Train subjects: {num_train_subjects}",
    f"  - Test subjects: {num_test_subjects}",
    sep="\n",
)

Number of subjects for inference demo: 1000
  - Train subjects: 700
  - Test subjects: 300


In [29]:
# Fixed seed so the same patients are drawn on every run.
np.random.seed(42)

# Draw order matters for reproducibility: train is sampled first, then test.
demo_subjects_train = np.random.choice(
    subject_ids_train, size=num_train_subjects, replace=False
)
demo_subjects_test = np.random.choice(
    subject_ids_test, size=num_test_subjects, replace=False
)

# Keep every row belonging to a sampled patient.
in_train_sample = train_df[patient_id_col].isin(demo_subjects_train)
in_test_sample = test_df[patient_id_col].isin(demo_subjects_test)
demo_train_df = train_df.loc[in_train_sample]
demo_test_df = test_df.loc[in_test_sample]

# Stack the two selections row-wise (train rows first).
demo_df = pd.concat([demo_train_df, demo_test_df])

# Distinct admission ids of the demo cohort; used to filter the raw tables.
event_ids = pd.unique(demo_df[hosp_id_col])

In [30]:
# Header line followed by the array of selected admission ids.
print("HADM_IDs in demo data:", event_ids, sep="\n")

HADM_IDs in demo data:
[145243 105083 137006 ... 126023 151454 112748]


In [34]:
# Distinct patients in the combined demo frame; used to filter the patients table.
subject_ids = pd.unique(demo_df[patient_id_col])
n_demo_patients = len(subject_ids)
print(f"Number of unique patients in demo data: {n_demo_patients}")

Number of unique patients in demo data: 1000


In [31]:
from recurrent_health_events_prediction.data_extraction.DataExtractor import DataExtractorMIMIC
from recurrent_health_events_prediction.data_extraction.data_types import DiseaseType

dataset_config = config['dataset']['mimic']
print(f"Dataset path: {dataset_config['path']}")

# Diseases used when the preprocessing config has no "selected_diseases" entry.
default_diseases = [
    DiseaseType.CHRONIC_PULMONARY_DISEASE,
    DiseaseType.CONGESTIVE_HEART_FAILURE,
    DiseaseType.DIABETES_WITH_COMPLICATION,
    DiseaseType.RENAL_DISEASE,
]
# Entries given as strings are converted to DiseaseType members;
# anything else is passed through unchanged.
selected_diseases = [
    disease if not isinstance(disease, str) else DiseaseType(disease)
    for disease in preprocessing_config.get("selected_diseases", default_diseases)
]

data_extractor = DataExtractorMIMIC(dataset_config, selected_diseases)
data_extractor.load_data()

# Pull each table from the extractor once loading has finished.
admissions_df = data_extractor.get_admissions_df()
icu_stays_df = data_extractor.get_icu_stays_df()
procedures_df = data_extractor.get_procedures_df()
prescriptions_df = data_extractor.get_prescriptions_df()
patients_metadata_df = data_extractor.get_patients_df()

Dataset path: C:\Users\User\projects\msc-thesis-recurrent-health-modeling\data\mimic-iii-dataset


  prescriptions_df = pd.read_csv(self.data_path + '/PRESCRIPTIONS.csv')


In [32]:
# Rebind the five working frames to whatever the extractor currently returns.
# These are the same getter calls that end the extraction cell; re-running this
# cell discards any column selection already applied to these names.
# NOTE(review): presumably the getters return the loaded tables unchanged, which
# would make this cell redundant right after extraction; confirm before removing.
admissions_df = data_extractor.get_admissions_df()
icu_stays_df = data_extractor.get_icu_stays_df()
procedures_df = data_extractor.get_procedures_df()
prescriptions_df = data_extractor.get_prescriptions_df()
patients_metadata_df = data_extractor.get_patients_df()

In [None]:
# Reduce every table to the columns listed for it in required_cols.
# A column that is missing from a table raises KeyError here.
admissions_df = admissions_df.loc[:, required_cols["admissions_diagnoses"]].copy()
icu_stays_df = icu_stays_df.loc[:, required_cols["icu_stays"]].copy()
procedures_df = procedures_df.loc[:, required_cols["procedures"]].copy()
prescriptions_df = prescriptions_df.loc[:, required_cols["prescriptions"]].copy()
patients_metadata_df = patients_metadata_df.loc[:, required_cols["patients"]].copy()

In [35]:
def _rows_matching(frame, key_col, wanted_ids):
    """Return a copy of the rows of `frame` whose `key_col` value is in `wanted_ids`."""
    return frame.loc[frame[key_col].isin(wanted_ids)].copy()


# Admission-level tables are restricted by admission id; the patients table
# is restricted by patient id.
admissions_selected_df = _rows_matching(admissions_df, hosp_id_col, event_ids)
icu_stays_selected_df = _rows_matching(icu_stays_df, hosp_id_col, event_ids)
procedures_selected_df = _rows_matching(procedures_df, hosp_id_col, event_ids)
prescriptions_selected_df = _rows_matching(prescriptions_df, hosp_id_col, event_ids)
patients_metadata_selected_df = _rows_matching(patients_metadata_df, patient_id_col, subject_ids)

In [39]:
# Parse the timestamp columns used as sort keys so sorting compares datetimes.
admissions_selected_df[time_col] = pd.to_datetime(admissions_selected_df[time_col])
icu_stays_selected_df['INTIME'] = pd.to_datetime(icu_stays_selected_df['INTIME'])

# Frames are rebound rather than sorted with inplace=True, so each statement
# reads as "new value from old value".
admissions_selected_df = admissions_selected_df.sort_values(by=[patient_id_col, time_col])
icu_stays_selected_df = icu_stays_selected_df.sort_values(by=[patient_id_col, 'INTIME'])

# Sorting on a single key uses an unstable quicksort by default, which leaves
# the order of rows sharing a patient id implementation-defined.
# kind="stable" keeps those rows in their incoming order.
procedures_selected_df = procedures_selected_df.sort_values(by=[patient_id_col], kind="stable")
prescriptions_selected_df = prescriptions_selected_df.sort_values(by=[patient_id_col], kind="stable")
patients_metadata_selected_df = patients_metadata_selected_df.sort_values(by=[patient_id_col], kind="stable")


In [40]:
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (this is a no-op when it already does).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One CSV per table, named after the table keys used in required_cols;
# the DataFrame index is not written.
admissions_selected_df.to_csv(f"{OUTPUT_DIR}/admissions_diagnoses.csv", index=False)
icu_stays_selected_df.to_csv(f"{OUTPUT_DIR}/icu_stays.csv", index=False)
procedures_selected_df.to_csv(f"{OUTPUT_DIR}/procedures.csv", index=False)
prescriptions_selected_df.to_csv(f"{OUTPUT_DIR}/prescriptions.csv", index=False)
patients_metadata_selected_df.to_csv(f"{OUTPUT_DIR}/patients.csv", index=False)