In [5]:
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import yaml

import pandas as pd
import numpy as np

# Load the packaged data configuration.
# The resource is opened through the Traversable returned by
# importlib.resources instead of the built-in open(): this does not require the
# package to be unpacked on the filesystem, and the explicit encoding keeps the
# YAML parsing independent of the platform's default locale.
config_resource = impresources.files(configs) / 'data_config.yaml'
with config_resource.open('r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [6]:
print(config)

{'dataset': {'mimic': {'path': '/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-dataset', 'columns_to_load': {'admission': ['HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE', 'INSURANCE', 'ETHNICITY', 'DISCHARGE_LOCATION'], 'patient': ['SUBJECT_ID', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP'], 'procedure': ['ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE'], 'prescription': ['HADM_ID', 'SUBJECT_ID', 'DRUG']}}, 'relapse': {'path': '/workspaces/msc-thesis-recurrent-health-modeling/data/avh-data', 'drop_no_showedup': True, 'min_num_test_days': 3}}, 'training_data': {'relapse': {'preprocessed_path': '/workspaces/msc-thesis-recurrent-health-modeling/data/avh-data-preprocessed', 'log_cols_to_add': ['TIME_SINCE_LAST_NEGATIVE', 'TIME_SINCE_LAST_POSITIVE', 'PARTICIPATION_DAYS', 'TIME_UNTIL_NEXT_POSITIVE'], 'cols_to_add_start_relapse': ['PARTICIPATION_DAYS', 'DRUGS_TESTED', 'DRUG_POSITIVE_PAST_MEAN', 'DRUG_POSITIVE_PAST_SUM', 'NUM_POSITIVES_SINCE_LAST_NEGATIVE', 'TIME_SINCE_LAST_NEGATIVE', 'T

## Data Extractor

In [7]:
from recurrent_health_events_prediction.data_extraction.DataExtractor import DataExtractorMIMIC
from recurrent_health_events_prediction.data_extraction.data_types import DiseaseType

# MIMIC section of the data config (dataset path and per-table columns to load).
dataset_config = config['dataset']['mimic']

print(f"Dataset path: {dataset_config['path']}")

# Diseases handed to the extractor.
# NOTE(review): presumably these restrict the extracted cohort to COPD and
# congestive-heart-failure patients — confirm in DataExtractorMIMIC.
selected_diseases = [DiseaseType.CHRONIC_PULMONARY_DISEASE, DiseaseType.CONGESTIVE_HEART_FAILURE]

# Build the MIMIC extractor and load its tables.
# NOTE(review): the name `dataset_config` is rebound to the relapse config by the
# drug-relapse cell further down, so re-running cells out of order can pick up
# the wrong configuration.
data_extractor_mimic = DataExtractorMIMIC(dataset_config, selected_diseases)
data_extractor_mimic.load_data()

Dataset path: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-dataset


  prescriptions_df = pd.read_csv(self.data_path + '/PRESCRIPTIONS.csv')


In [None]:
from recurrent_health_events_prediction.data_extraction.DataExtractor import DataExtractorDrugRelapse

# Relapse section of the data config (path, drop_no_showedup, min_num_test_days).
# NOTE(review): this rebinds `dataset_config`, which the MIMIC cell above also
# uses for its own configuration.
dataset_config = config['dataset']['relapse']

# Build the drug-relapse extractor and fetch the donor and drug-test tables.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# following cells use `drug_tests_df`; run it before them on a fresh kernel.
data_extractor_drug_relapse = DataExtractorDrugRelapse(dataset_config)
donor_df = data_extractor_drug_relapse.get_donor_df()
drug_tests_df = data_extractor_drug_relapse.get_drug_tests_df()

In [4]:
drug_tests_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10590672 entries, 0 to 15629777
Data columns (total 7 columns):
 #   Column              Dtype         
---  ------              -----         
 0   donor_id            int64         
 1   drug_class          object        
 2   time                datetime64[ns]
 3   drug_test_positive  bool          
 4   collection_id       int64         
 5   showedup            bool          
 6   program_type        object        
dtypes: bool(2), datetime64[ns](1), int64(2), object(2)
memory usage: 505.0+ MB


In [6]:
drug_tests_df.donor_id.unique()

array([6500262, 3070213, 9415201, ..., 9343138, 1572156, 1145793],
      shape=(63199,))

### MIMIC dataframes

In [9]:
admissions_df = data_extractor_mimic.get_admissions_df()
admissions_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ADMITTIME,DISCHTIME,ADMISSION_TYPE,INSURANCE,ETHNICITY,DISCHARGE_LOCATION,SHORT_TITLE,LONG_TITLE,COMORBIDITY
0,1297,109,172335,1.0,40301,2141-09-18 10:32:00,2141-09-24 13:53:00,EMERGENCY,Medicaid,BLACK/AFRICAN AMERICAN,HOME HEALTH CARE,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant...",renal_disease
1,1298,109,172335,2.0,486,2141-09-18 10:32:00,2141-09-24 13:53:00,EMERGENCY,Medicaid,BLACK/AFRICAN AMERICAN,HOME HEALTH CARE,"Pneumonia, organism NOS","Pneumonia, organism unspecified",other
2,1299,109,172335,3.0,58281,2141-09-18 10:32:00,2141-09-24 13:53:00,EMERGENCY,Medicaid,BLACK/AFRICAN AMERICAN,HOME HEALTH CARE,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...,renal_disease
3,1300,109,172335,4.0,5855,2141-09-18 10:32:00,2141-09-24 13:53:00,EMERGENCY,Medicaid,BLACK/AFRICAN AMERICAN,HOME HEALTH CARE,Chron kidney dis stage V,"Chronic kidney disease, Stage V",renal_disease
4,1301,109,172335,5.0,4254,2141-09-18 10:32:00,2141-09-24 13:53:00,EMERGENCY,Medicaid,BLACK/AFRICAN AMERICAN,HOME HEALTH CARE,Prim cardiomyopathy NEC,Other primary cardiomyopathies,congestive_heart_failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634681,639774,97488,161999,19.0,7843,2128-08-27 15:01:00,2128-09-04 15:30:00,EMERGENCY,Medicare,WHITE,DEAD/EXPIRED,Aphasia,Aphasia,other
634682,639776,97488,161999,21.0,30391,2128-08-27 15:01:00,2128-09-04 15:30:00,EMERGENCY,Medicare,WHITE,DEAD/EXPIRED,Alcoh dep NEC/NOS-contin,"Other and unspecified alcohol dependence, cont...",other
634683,639777,97488,161999,22.0,E8798,2128-08-27 15:01:00,2128-09-04 15:30:00,EMERGENCY,Medicare,WHITE,DEAD/EXPIRED,Abn react-procedure NEC,Other specified procedures as the cause of abn...,other
634684,639778,97488,161999,23.0,78791,2128-08-27 15:01:00,2128-09-04 15:30:00,EMERGENCY,Medicare,WHITE,DEAD/EXPIRED,Diarrhea,Diarrhea,other


In [10]:
icu_df = data_extractor_mimic.get_icu_stays_df()
icu_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DBSOURCE,FIRST_CAREUNIT,LAST_CAREUNIT,FIRST_WARDID,LAST_WARDID,INTIME,OUTTIME,LOS
0,365,268,110404,280836,carevue,MICU,MICU,52,52,2198-02-14 23:27:38,2198-02-18 05:26:11,3.249
2,367,270,188028,220345,carevue,CCU,CCU,57,57,2128-06-24 15:05:20,2128-06-27 12:32:29,2.8939
5,370,273,158689,241507,carevue,MICU,MICU,52,52,2141-04-19 06:12:05,2141-04-20 17:52:11,1.4862
12,377,281,111199,257572,carevue,MICU,MICU,52,52,2101-10-18 04:45:22,2101-10-25 22:29:25,7.7389
14,379,283,109185,231490,carevue,MICU,MICU,15,15,2166-08-12 22:03:26,2166-09-12 14:41:42,30.6932


In [11]:
patients_metadata_df = data_extractor_mimic.get_patients_df()
patients_metadata_df.head()

Unnamed: 0,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP
0,249,F,2075-03-13,NaT,NaT
4,253,F,2089-11-26,NaT,NaT
6,256,M,2086-07-31,NaT,NaT
10,261,M,2025-08-04,2102-06-29,2102-06-29
11,262,M,2090-01-05,NaT,NaT


In [12]:
prescriptions_df = data_extractor_mimic.get_prescriptions_df()
prescriptions_df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DRUG
75,150750,9,SW
76,150750,9,Labetalol HCl
77,150750,9,Potassium Chloride
78,150750,9,Potassium Chloride
79,150750,9,D5W


In [13]:
procedures_df = data_extractor_mimic.get_procedures_df()
procedures_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,944,62641,154460,3,3404,Insert intercostal cath,Insertion of intercostal catheter for drainage
1,945,2592,130856,1,9671,Cont inv mec ven <96 hrs,Continuous invasive mechanical ventilation for...
2,946,2592,130856,2,3893,Venous cath NEC,"Venous catheterization, not elsewhere classified"
3,947,55357,119355,1,9672,Cont inv mec ven 96+ hrs,Continuous invasive mechanical ventilation for...
4,948,55357,119355,2,331,Spinal tap,Spinal tap


#### Testing Consistency of SUBJECT_ID and HADM_ID

In [12]:
set(icu_df["SUBJECT_ID"].unique()) - set(admissions_df["SUBJECT_ID"].unique())

set()

In [13]:
# HADM_IDs present in the ICU stays but absent from the admissions frame.
# The recorded run shows exactly one such id (176570).
set(icu_df["HADM_ID"].unique()) - set(admissions_df["HADM_ID"].unique())

{np.int64(176570)}

In [14]:
set(prescriptions_df["HADM_ID"].unique()) - set(admissions_df["HADM_ID"].unique())

set()

In [15]:
set(patients_metadata_df["SUBJECT_ID"].unique()) - set(admissions_df["SUBJECT_ID"].unique())

set()

In [16]:
set(procedures_df["HADM_ID"].unique()) - set(admissions_df["HADM_ID"].unique())

set()

## Feature Extractor

### MIMIC Dataset

In [17]:
from recurrent_health_events_prediction.preprocessing.feature_extraction import FeatureExtractorMIMIC

# Build the feature table from the five MIMIC frames loaded above.
# NOTE(review): three readmission labels are passed together with three bin
# edges; presumably build_features adds an open-ended upper edge for the
# '120+' bucket — TODO confirm in FeatureExtractorMIMIC.
X = FeatureExtractorMIMIC.build_features(
    admissions_df,
    icu_df,
    prescriptions_df,
    procedures_df,
    patients_metadata_df,
    readmission_time_labels=['0-30', '30-120', '120+'],
    bins=[0, 30, 120]
)


In [19]:
X.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE',
       'ETHNICITY', 'INSURANCE', 'HOSPITALIZATION_DAYS', 'NUM_COMORBIDITIES',
       'TYPES_COMORBIDITIES', 'HAS_DIABETES', 'HAS_COPD', 'HAS_CONGESTIVE_HF',
       'NEXT_ADMISSION_TYPE', 'NUM_PREV_HOSPITALIZATIONS', 'PREV_DISCHTIME',
       'NEXT_ADMITTIME', 'DAYS_SINCE_LAST_HOSPITALIZATION',
       'DAYS_UNTIL_NEXT_HOSPITALIZATION', 'PREV_READMISSION_30_DAYS',
       'READMISSION_30_DAYS', 'READMISSION_TIME_CAT',
       'READMISSION_TIME_CAT_ENCODED', 'READM_30_DAYS_PAST_MEAN',
       'READM_30_DAYS_PAST_SUM', 'DAYS_UNTIL_NEXT_HOSP_PAST_MEAN',
       'DAYS_UNTIL_NEXT_HOSP_PAST_MEDIAN', 'DAYS_UNTIL_NEXT_HOSP_PAST_STD',
       'TOTAL_HOSPITALIZATIONS', 'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES',
       'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'AGE', 'CHARLSON_INDEX',
       'FIRST_ADMITTIME', 'LAST_DISCHTIME', 'PARTICIPATION_DAYS',
       'TOTAL_PARTICIPATION_DAYS'],
      dtype='object')

In [30]:
# Time-to-next-admission interval for every row: it starts at discharge and
# ends at the next admission.
X["START"] = X["DISCHTIME"]
X["END"] = X["NEXT_ADMITTIME"]
# Interval length in fractional days; NaN where there is no next admission
# (END is NaT).
X["DURATION"] = (X["END"] - X["START"]).dt.total_seconds() / (24 * 3600)
# The former `X["SUBJECT_ID"] = X["SUBJECT_ID"].copy()` was a no-op
# self-assignment and has been removed.
# NOTE(review): EVENT_ID is a plain copy of SUBJECT_ID, so it is not unique per
# row — confirm whether a per-admission key such as HADM_ID was intended.
X["EVENT_ID"] = X["SUBJECT_ID"].copy()

In [31]:
X[["SUBJECT_ID", "START", "END", "DURATION"]]

Unnamed: 0,SUBJECT_ID,START,END,DURATION
0,111,2142-05-05 11:45:00,2144-07-01 04:12:00,787.685417
1,111,2144-07-01 14:55:00,NaT,
2,124,2160-07-15 15:10:00,2161-12-17 03:39:00,519.520139
3,124,2161-12-24 15:35:00,2165-05-21 21:02:00,1244.227083
4,124,2165-06-06 16:00:00,2165-12-31 18:55:00,208.121528
...,...,...,...,...
2248,99556,2167-07-31 21:53:00,NaT,
2249,99613,2152-11-14 15:43:00,2153-02-10 01:00:00,87.386806
2250,99613,2153-02-26 17:00:00,NaT,
2251,99747,2103-11-30 13:53:00,NaT,


In [33]:
 # Boolean mask only (no split is performed here): True where a row's END equals
 # the latest END of its subject, or where END is missing (NaT).
 # NOTE(review): the recorded output flags BOTH rows of subject 111 (the last
 # observed readmission and the row without a next admission) — confirm this is
 # the intended rule before using it to split historical vs. last-event data.
(X.groupby("SUBJECT_ID")["END"].transform('max') == X["END"]) | (X["END"].isna())

0        True
1        True
2       False
3       False
4        True
        ...  
2248     True
2249     True
2250     True
2251     True
2252     True
Name: END, Length: 2253, dtype: bool

In [22]:
# Columns left out before looking at which numeric features can be averaged
# per subject.
cols_to_not_consider = ["HADM_ID", "AGE"]
cols = [name for name in X.columns if name not in cols_to_not_consider]

# Names of the numeric columns that remain after a per-subject mean.
(
    X[cols]
    .select_dtypes(include='number')
    .groupby("SUBJECT_ID")
    .mean()
    .columns
)

Index(['HOSPITALIZATION_DAYS', 'NUM_DIAGNOSES', 'NUM_PREV_HOSPITALIZATIONS',
       'DAYS_SINCE_LAST_HOSPITALIZATION', 'DAYS_IN_ICU', 'NUM_DRUGS',
       'NUM_PROCEDURES'],
      dtype='object')

In [16]:
# Summary statistics for rows with AGE of 90 or more.
# NOTE(review): the recorded output shows AGE between 300 and 306 for this
# group; presumably an artefact of the date shifting MIMIC-III applies to
# patients older than 89 — confirm and cap or recode before modelling.
mask = X["AGE"] >= 90
X.loc[mask, ["SUBJECT_ID", "AGE"]].describe()

Unnamed: 0,SUBJECT_ID,AGE
count,153.0,153.0
mean,29889.163399,301.300654
std,25607.339233,1.732374
min,368.0,300.0
25%,14269.0,300.0
50%,18082.0,300.0
75%,41311.0,302.0
max,96793.0,306.0


In [16]:
X['SUBJECT_ID'].nunique()

1068

In [11]:
X.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE',
       'HOSPITALIZATION_DAYS', 'NUM_DIAGNOSES', 'TYPES_COMORBIDITIES',
       'NUM_PREV_HOSPITALIZATIONS', 'DAYS_SINCE_LAST_HOSPITALIZATION',
       'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES'],
      dtype='object')

In [12]:
import plotly.express as px

def plot_patient_hospitalizations(df, subject_id):
    """Show a Gantt-style timeline of one subject's hospitalizations.

    Every hospitalization is drawn as a bar from ``ADMITTIME`` to
    ``DISCHTIME``, coloured by ``ADMISSION_TYPE`` and labelled
    "Hospitalization #k" in chronological order.

    Args:
        df: DataFrame with at least the ``SUBJECT_ID``, ``ADMITTIME``,
            ``DISCHTIME`` and ``ADMISSION_TYPE`` columns. ``HADM_ID``,
            ``NUM_DIAGNOSES``, ``DAYS_IN_ICU`` and ``NUM_DRUGS`` are added to
            the hover box when they exist.
        subject_id: Value of ``SUBJECT_ID`` selecting the rows to plot.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Work on a sorted copy so the caller's frame is never modified.
    patient_df = df[df['SUBJECT_ID'] == subject_id].sort_values('ADMITTIME').copy()

    # Number the stays chronologically. All rows belong to the same subject, so
    # a running counter over the sorted rows is sufficient.
    patient_df['EVENT'] = [
        f"Hospitalization #{position}" for position in range(1, len(patient_df) + 1)
    ]

    # Hover columns are optional extras: keep only those that exist so a change
    # in the feature schema does not break the plot.
    hover_columns = [
        column
        for column in ("HADM_ID", "NUM_DIAGNOSES", "DAYS_IN_ICU", "NUM_DRUGS")
        if column in patient_df.columns
    ]

    # Plot Gantt chart
    fig = px.timeline(
        patient_df,
        x_start="ADMITTIME",
        x_end="DISCHTIME",
        y="EVENT",
        color="ADMISSION_TYPE",
        hover_data=hover_columns or None,
    )
    fig.update_yaxes(autorange="reversed")  # Make earlier hospitalizations appear higher
    fig.update_layout(title=f"Hospitalizations Timeline for SUBJECT_ID {subject_id}")
    fig.show()


In [18]:
plot_patient_hospitalizations(X, subject_id=124)

In [16]:
import plotly.express as px
import pandas as pd

def format_feature_value(value):
    """Return the text shown for a feature value.

    Floats are rendered with two decimal places; any other value is converted
    with ``str``.
    """
    if not isinstance(value, float):
        return str(value)
    return format(value, ".2f")

def plot_subject_evolution(df, subject_id, plot_img=False, img_filename="timeline.png",
                           features_to_plot=None):
    """Show how a subject's features evolve across hospitalizations.

    One timeline row is drawn per feature; each hospitalization is a bar from
    ``ADMITTIME`` to ``DISCHTIME``, coloured by ``ADMISSION_TYPE`` and
    annotated with the feature's formatted value.

    Args:
        df: DataFrame with at least the ``SUBJECT_ID``, ``ADMITTIME``,
            ``DISCHTIME`` and ``ADMISSION_TYPE`` columns plus the feature
            columns to display.
        subject_id: Value of ``SUBJECT_ID`` selecting the rows to plot.
        plot_img: When True, also write the figure to ``img_filename``.
        img_filename: Target path used when ``plot_img`` is True.
        features_to_plot: Optional iterable of feature column names. Defaults
            to the original fixed list. Names missing from ``df`` are skipped
            with a warning.

    Raises:
        KeyError: If none of the requested features exist in ``df``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Features to track over time (excluding ID/time columns)
    default_features = (
        'HOSPITALIZATION_DAYS', 'NUM_DIAGNOSES', 'TYPES_COMORBIDITIES',
        'NUM_PREV_HOSPITALIZATIONS', 'DAYS_SINCE_LAST_HOSPITALIZATION',
        'DAYS_IN_ICU', 'NUM_DRUGS', 'NUM_PROCEDURES'
    )
    requested = list(default_features if features_to_plot is None else features_to_plot)

    # Tolerate schema drift: plot what is available, report what is not.
    available = [name for name in requested if name in df.columns]
    missing = [name for name in requested if name not in df.columns]
    if not available:
        raise KeyError(f"None of the requested features are present in df: {requested}")
    if missing:
        import warnings
        warnings.warn(f"Skipping features not present in df: {missing}", stacklevel=2)

    # Filter for the patient
    patient_df = df[df['SUBJECT_ID'] == subject_id].copy()
    patient_df = patient_df.sort_values('ADMITTIME')

    # Melt the data so each (hospitalization, feature) pair is a row
    melted = patient_df.melt(
        id_vars=['ADMITTIME', 'ADMISSION_TYPE', 'DISCHTIME'],
        value_vars=available,
        var_name='Feature',
        value_name='Value'
    )

    # Convert all values to string for display
    melted['Value'] = melted['Value'].apply(format_feature_value)

    # Define colors for admission types
    color_discrete_map = {
        "URGENT": "red",
        "EMERGENCY": "orange",
        "ELECTIVE": "green"
    }

    # Create the plot
    fig = px.timeline(
        melted,
        x_start='ADMITTIME',
        x_end='DISCHTIME',
        y='Feature',
        color='ADMISSION_TYPE',
        text='Value',
        title=f'Evolution of SUBJECT_ID {subject_id}',
        color_discrete_map=color_discrete_map
    )
    fig.update_traces(textposition='outside', textfont_size=10)

    # Pad the time axis (100 days before, 730 days after) so the labels drawn
    # outside the bars are not cut off.
    axis_start = melted['ADMITTIME'].min() - pd.Timedelta(days=100)
    axis_end = melted['DISCHTIME'].max() + pd.Timedelta(days=730)
    fig.update_layout(
        height=600,
        yaxis_title='Feature',
        xaxis_title='Time',
        margin=dict(l=100, r=100, t=50, b=50),
        xaxis=dict(range=[axis_start, axis_end])
    )

    if plot_img:
        # Save the plot as a PNG image
        fig.write_image(img_filename)
        print(f"Timeline saved as {img_filename}")

    fig.show()



In [18]:
plot_subject_evolution(X, subject_id=124)

In [26]:
X

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,HOSPITALIZATION_DAYS,NUM_DIAGNOSES,TYPES_COMORBIDITIES,NUM_PREV_HOSPITALIZATIONS,DAYS_SINCE_LAST_HOSPITALIZATION,DAYS_IN_ICU,NUM_DRUGS,NUM_PROCEDURES
0,111,192123,2142-04-24 06:55:00,2142-05-05 11:45:00,EMERGENCY,11.201389,11,"[other, copd]",0,,10.570833,57.0,6.0
1,111,155897,2144-07-01 04:12:00,2144-07-01 14:55:00,EMERGENCY,0.446528,12,[other],1,787.685417,0.571204,21.0,4.0
2,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,EMERGENCY,20.739583,9,[other],0,,3.911852,38.0,4.0
3,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,EMERGENCY,7.497222,9,[other],1,519.520139,7.189676,37.0,4.0
4,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,ELECTIVE,15.790278,15,[other],2,1244.227083,1.336204,42.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2248,99556,196292,2167-07-30 20:33:00,2167-07-31 21:53:00,EMERGENCY,1.055556,19,"[other, copd]",2,161.068056,0.899039,20.0,1.0
2249,99613,175391,2152-11-03 18:12:00,2152-11-14 15:43:00,EMERGENCY,10.896528,11,[other],0,,1.840139,40.0,1.0
2250,99613,177517,2153-02-10 01:00:00,2153-02-26 17:00:00,EMERGENCY,16.666667,15,"[other, copd]",1,87.386806,16.676794,61.0,8.0
2251,99747,136052,2103-11-23 19:32:00,2103-11-30 13:53:00,EMERGENCY,6.764583,19,"[other, copd]",0,,4.039919,36.0,
