In [1]:
import yaml
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import pandas as pd
import numpy as np
import os

from recurrent_health_events_prediction.training.utils_survival import set_observation_window


# Load the packaged data configuration.
# The resource is opened through the Traversable API rather than the built-in open(),
# so it does not rely on the package being stored as plain files on disk, and the
# explicit encoding avoids locale-dependent decoding of the YAML file.
data_config_resource = impresources.files(configs) / 'data_config.yaml'
with data_config_resource.open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

# Filter MIMIC Dataset

In [4]:
# Select the MIMIC section of the data configuration and read the folder that holds
# the first-round preprocessed CSV files used as input for the filtering below.
training_data_config = data_config["training_data"]["mimic"]
training_data_path = training_data_config["preprocessed_output_path_1st_round"]

In [5]:
# Input CSV with the last event of each patient.
# os.path.join matches how the output paths are built and tolerates a trailing separator.
last_events_file_path = os.path.join(training_data_path, "last_events.csv")
print(f"Using file for last events: {last_events_file_path}")

Using file for last events: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/last_events.csv


In [6]:
# Input CSV with all events of every patient.
# os.path.join matches how the output paths are built and tolerates a trailing separator.
all_events_file_path = os.path.join(training_data_path, "all_events.csv")
print(f"Using file for all events: {all_events_file_path}")

Using file for all events: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/all_events.csv


In [7]:
# Input CSV with the historical events.
# os.path.join matches how the output paths are built and tolerates a trailing separator.
historical_events_file_path = os.path.join(training_data_path, "historical_events.csv")
print(f"Using file for historical events: {historical_events_file_path}")

Using file for historical events: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/historical_events.csv


In [8]:
# Load the three preprocessed MIMIC event tables with pandas' default CSV parsing
# (no date parsing is requested, so timestamp columns stay as strings).
last_events_df, all_events_df, historical_events_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        last_events_file_path,
        all_events_file_path,
        historical_events_file_path,
    )
)

In [9]:
# Inspect column names, dtypes and non-null counts of the last-events table.
last_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13533 entries, 0 to 13532
Data columns (total 57 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SUBJECT_ID                            13533 non-null  int64  
 1   HADM_ID                               13533 non-null  int64  
 2   ADMITTIME                             13533 non-null  object 
 3   DISCHTIME                             13533 non-null  object 
 4   ADMISSION_TYPE                        13533 non-null  object 
 5   ETHNICITY                             13533 non-null  object 
 6   DISCHARGE_LOCATION                    13533 non-null  object 
 7   INSURANCE                             13533 non-null  object 
 8   HOSPITALIZATION_DAYS                  13533 non-null  float64
 9   NUM_COMORBIDITIES                     13533 non-null  int64  
 10  TYPES_COMORBIDITIES                   13533 non-null  object 
 11  HAS_DIABETES   

In [10]:
# Summary statistics of the hospitalization-count and readmission columns in the last events.
last_events_df[['SUBJECT_ID', 'NUM_PREV_HOSPITALIZATIONS', 'TOTAL_HOSPITALIZATIONS', 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'READMISSION_30_DAYS']].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,13533.0,13533.0,13533.0,4988.0,13533.0
mean,38340.865218,0.300968,1.669549,4.928037,0.086529
std,29337.614248,1.152025,1.392843,1.859013,0.281154
min,6.0,0.0,1.0,0.695228,0.0
25%,14054.0,0.0,1.0,3.530803,0.0
50%,27997.0,0.0,1.0,5.094089,0.0
75%,62664.0,0.0,2.0,6.455088,0.0
max,99991.0,40.0,42.0,8.320927,1.0


In [11]:
# Same summary for the historical events, for comparison with the last events.
historical_events_df[['SUBJECT_ID', 'NUM_PREV_HOSPITALIZATIONS', 'TOTAL_HOSPITALIZATIONS', 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'READMISSION_30_DAYS']].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,4073.0,4073.0,4073.0,4073.0,4073.0
mean,26962.4137,1.855144,6.710287,4.710349,0.246501
std,24423.716828,3.638656,5.86698,1.70839,0.431027
min,36.0,0.0,3.0,0.707283,0.0
25%,10635.0,0.0,4.0,3.44714,0.0
50%,19620.0,1.0,5.0,4.803264,0.0
75%,29866.0,2.0,7.0,6.072342,0.0
max,99982.0,39.0,42.0,8.252735,1.0


In [12]:
# Same summary for the table containing every event.
all_events_df[['SUBJECT_ID', 'NUM_PREV_HOSPITALIZATIONS', 'TOTAL_HOSPITALIZATIONS', 'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION', 'READMISSION_30_DAYS']].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,17606.0,17606.0,17606.0,9061.0,17606.0
mean,35708.556174,0.660513,2.835681,4.830184,0.123537
std,28680.425933,2.124149,3.737827,1.796043,0.329062
min,6.0,0.0,1.0,0.695228,0.0
25%,12912.5,0.0,1.0,3.488903,0.0
50%,25416.5,0.0,2.0,4.940749,0.0
75%,57764.5,0.0,3.0,6.295483,0.0
max,99991.0,40.0,42.0,8.320927,1.0


In [13]:
# Destination folder for the filtered MIMIC tables, created under the preprocessed data folder.
# Built with os.path.join for consistency with the output file paths derived from it.
output_path = os.path.join(training_data_path, "multiple_hosp_patients")
print(f"Saving output to: {output_path}")

Saving output to: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients


In [14]:
# Create the output folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_path, exist_ok=True)

In [15]:
# Switches controlling how the MIMIC last events are filtered; they are also exported
# to control_variables.yaml in the Metadata section.
# EXCLUDE_ELECTIVE: drop rows whose ADMISSION_TYPE or NEXT_ADMISSION_TYPE is "ELECTIVE".
# SELECT_ONLY_ONE_EVENT_PER_PATIENT: keep a single row per SUBJECT_ID.
EXCLUDE_ELECTIVE = False
SELECT_ONLY_ONE_EVENT_PER_PATIENT = True
SET_OBSERVATION_WINDOW_SURVIVAL = True  # set observation window for survival analysis
OBSERVATION_WINDOW = 120  # days
SELECT_PATIENTS_WITH_MORE_PREV_HOSP = True  # select patients with at least MIN_PREV_HOSP previous hospitalizations
MIN_PREV_HOSP = 1  # minimum number of previous hospitalizations (inclusive) for a patient to be selected

output_file_last_events = os.path.join(output_path, "last_events.csv")

In [16]:
# Class balance of the AFTER_HOSP_DEATH_EVENT flag, ordered by flag value.
last_events_df["AFTER_HOSP_DEATH_EVENT"].value_counts().sort_index()

AFTER_HOSP_DEATH_EVENT
0    11543
1     1990
Name: count, dtype: int64

## Filter Last Events

In [19]:
# Baseline row and patient counts before any filter is applied.
print(f"Number of last events: {len(last_events_df)}")
print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events: 13533
Number of unique patients in last events:  13533


In [18]:
if EXCLUDE_ELECTIVE:
    # Drop a row when either the admission itself or the following admission is elective.
    next_is_not_elective = last_events_df["NEXT_ADMISSION_TYPE"] != "ELECTIVE"
    current_is_not_elective = last_events_df["ADMISSION_TYPE"] != "ELECTIVE"
    last_events_df = last_events_df[next_is_not_elective & current_is_not_elective]

    print(f"Number of last events after excluding elective admissions: {len(last_events_df)}")
    print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

In [20]:
if SELECT_ONLY_ONE_EVENT_PER_PATIENT:
    # Keep only the earliest row (by ADMITTIME) of each subject, so that no subject
    # contributes more than one event to the training data, which could otherwise
    # lead to overfitting and biased predictions.
    # drop_duplicates keeps whole rows. GroupBy.first() instead returns the first
    # non-null value of every column, so it could combine values from different
    # rows of the same subject whenever the earliest row contains missing values.
    last_events_df = (
        last_events_df
        .sort_values(['SUBJECT_ID', 'ADMITTIME'])
        .drop_duplicates(subset='SUBJECT_ID', keep='first')
        .reset_index(drop=True)
    )

    print(f"Number of last events: {len(last_events_df)}")
    print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events: 13533
Number of unique patients in last events:  13533


In [21]:
if SELECT_PATIENTS_WITH_MORE_PREV_HOSP:
    # Inclusive bound: a patient with exactly MIN_PREV_HOSP previous hospitalizations is kept.
    has_enough_prev_hosp = last_events_df["NUM_PREV_HOSPITALIZATIONS"].ge(MIN_PREV_HOSP)
    last_events_df = last_events_df.loc[has_enough_prev_hosp]

    print(f"Number of last events after selection with mult. prev hosp.: {len(last_events_df)}")
    print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events after selection with mult. prev hosp.: 1878
Number of unique patients in last events:  1878


In [22]:
if SET_OBSERVATION_WINDOW_SURVIVAL:
    # Apply set_observation_window to every row, passing the window length in days.
    # NOTE(review): presumably the helper caps EVENT_DURATION at the window and marks
    # later events as censored — confirm in training.utils_survival.
    last_events_df = last_events_df.apply(set_observation_window, args=(OBSERVATION_WINDOW,), axis=1)

In [23]:
# Summary of the filtered last events, including EVENT_DURATION after the observation window step.
last_events_df[["NUM_PREV_HOSPITALIZATIONS", "TOTAL_HOSPITALIZATIONS", "LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION", "READMISSION_30_DAYS", "EVENT_DURATION"]].describe()

Unnamed: 0,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS,EVENT_DURATION
count,1878.0,1878.0,1878.0,1878.0,1878.0
mean,2.168797,4.168797,4.657916,0.260383,78.610586
std,2.348385,2.348385,1.72247,0.438961,46.827454
min,1.0,3.0,0.695228,0.0,1.004167
25%,1.0,3.0,3.378038,0.0,28.313194
50%,1.0,3.0,4.754864,0.0,115.147917
75%,2.0,4.0,6.09652,1.0,120.0
max,40.0,42.0,8.210691,1.0,120.0


In [24]:
# Write the filtered last events to the output folder (row index is not exported).
last_events_df.to_csv(output_file_last_events, index=False)

In [25]:
# Display-only preview of all events indexed by HADM_ID. set_index returns a new frame
# that is not assigned here, so all_events_df keeps HADM_ID as a regular column.
all_events_df.set_index("HADM_ID")

Unnamed: 0_level_0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,...,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_PARTICIPATION_DAYS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107064,6,2175-05-30 07:15:00,2175-06-15 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,16.364583,2,"['other', 'renal_disease']",...,0,2.854433,1.541783,0.000000,2.833213,3.637586,1,0,120+,2
109451,21,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,13.165278,6,"['myocardial_infarct', 'other', 'cerebrovascul...",...,0,2.650794,1.931770,0.000000,2.639057,3.970292,1,0,120+,2
197661,26,2126-05-06 15:16:00,2126-05-13 15:00:00,EMERGENCY,UNKNOWN/NOT SPECIFIED,HOME,Medicare,6.988889,3,"['other', 'congestive_heart_failure', 'myocard...",...,1,2.078052,1.144440,0.000000,1.945910,3.433987,1,0,120+,2
162569,28,2177-09-01 07:15:00,2177-09-06 16:00:00,ELECTIVE,WHITE,HOME HEALTH CARE,Medicare,5.364583,4,"['other', 'diabetes_without_cc', 'myocardial_i...",...,0,1.850749,0.752551,0.000000,1.791759,3.806662,1,0,120+,2
104557,30,2172-10-14 14:17:00,2172-10-19 14:37:00,URGENT,UNKNOWN/NOT SPECIFIED,HOME HEALTH CARE,Medicare,5.013889,2,"['other', 'congestive_heart_failure']",...,0,1.794072,1.046344,0.000000,1.791759,0.000000,1,0,120+,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159023,99939,2110-03-22 04:03:00,2110-04-02 15:02:00,EMERGENCY,HISPANIC OR LATINO,HOME,Medicaid,11.457639,2,"['other', 'renal_disease']",...,0,2.522334,1.355307,0.000000,2.484907,3.433987,1,0,120+,2
101083,99965,2191-07-13 19:39:00,2191-07-16 12:08:00,EMERGENCY,WHITE,HOME,Medicare,2.686806,3,"['other', 'malignant_cancer', 'chronic_pulmona...",...,0,1.304760,1.002914,0.000000,1.098612,2.397895,1,0,120+,2
151454,99982,2156-11-28 11:56:00,2156-12-08 13:45:00,EMERGENCY,WHITE,HOME HEALTH CARE,Medicare,10.075694,2,"['other', 'congestive_heart_failure']",...,0,2.404753,2.191571,0.000000,2.397895,3.850148,0,1,0-30,0
112748,99982,2157-01-05 17:27:00,2157-01-12 13:00:00,EMERGENCY,WHITE,HOME,Medicare,6.814583,3,"['other', 'congestive_heart_failure', 'chronic...",...,0,2.055992,1.782165,0.693147,3.828641,3.295837,1,0,30-120,1


## Filter All Events

In [26]:
# Destination CSV for the filtered all-events table.
output_file_all_events = os.path.join(output_path, "all_events.csv")

In [27]:
# Row and patient counts of all events before restricting them to the selected patients.
print(f"Number of all events before selecting events up to last event: {len(all_events_df)}")
print(f"Number of unique patients in all events: {all_events_df['SUBJECT_ID'].nunique()}")

Number of all events before selecting events up to last event: 17606
Number of unique patients in all events: 13533


In [28]:
from recurrent_health_events_prediction.preprocessing.utils import get_rows_up_to_event_id

# Series mapping each selected patient (SUBJECT_ID index) to the HADM_ID of their retained last event.
last_event_ids = last_events_df.set_index("SUBJECT_ID")["HADM_ID"]
# NOTE(review): judging by the helper's name and arguments, this keeps, per patient, the rows
# up to and including the given HADM_ID — confirm in preprocessing.utils.
all_events_df = get_rows_up_to_event_id(all_events_df, event_ids=last_event_ids, event_id_col="HADM_ID", include_event_id=True)

  return df.groupby(id_col, group_keys=False).apply(truncate_group)


In [29]:
# Row and patient counts of all events after the restriction; the patient count
# should match the filtered last events.
print(f"Number of all events after selecting events up to last event: {len(all_events_df)}")
print(f"Number of unique patients in all events: {all_events_df['SUBJECT_ID'].nunique()}")

Number of all events after selecting events up to last event: 5951
Number of unique patients in all events: 1878


In [30]:
if SET_OBSERVATION_WINDOW_SURVIVAL:
    # Apply the same row-wise observation-window helper as for the last events,
    # with the window length in days.
    all_events_df = all_events_df.apply(set_observation_window, args=(OBSERVATION_WINDOW,), axis=1)

In [31]:
# Write the filtered all-events table to the output folder (row index is not exported).
all_events_df.to_csv(output_file_all_events, index=False)

## Filter Historical Events

In [32]:
# Destination CSV for the historical-events table.
output_file_historical_events = os.path.join(output_path, "historical_events.csv")

In [33]:
# Keep only the historical events of patients still present in the filtered last events,
# so that the exported last, all and historical tables always describe the same cohort
# regardless of which filter switches are enabled.
selected_subject_ids = last_events_df['SUBJECT_ID'].unique()
historical_events_df = historical_events_df[historical_events_df['SUBJECT_ID'].isin(selected_subject_ids)]

print(f"Number of events in historical events: {len(historical_events_df)}")
print(f"Number of unique patients in historical events: {historical_events_df['SUBJECT_ID'].nunique()}")

Number of events in historical events: 4073
Number of unique patients in historical events: 1878


In [31]:
# Write the historical events to the output folder (row index is not exported).
historical_events_df.to_csv(output_file_historical_events, index=False)

## Metadata

In [34]:
# Store the filter switches used for this export next to the CSV files, so the
# cohort definition can be traced back later.
output_file_controls = os.path.join(output_path, "control_variables.yaml")

control_variables = {
    "EXCLUDE_ELECTIVE": EXCLUDE_ELECTIVE,
    "SELECT_ONLY_ONE_EVENT_PER_PATIENT": SELECT_ONLY_ONE_EVENT_PER_PATIENT,
    "SET_OBSERVATION_WINDOW_SURVIVAL": SET_OBSERVATION_WINDOW_SURVIVAL,
    "OBSERVATION_WINDOW": OBSERVATION_WINDOW,
    "SELECT_PATIENTS_WITH_MORE_PREV_HOSP": SELECT_PATIENTS_WITH_MORE_PREV_HOSP,
    "MIN_PREV_HOSP": MIN_PREV_HOSP,
}

with open(output_file_controls, 'w') as f:
    yaml.dump(control_variables, stream=f)

print(f"Control variables exported to: {output_file_controls}")

Control variables exported to: /workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/control_variables.yaml


### NA Treatment for Patients with Only One Hospital Admission

In [None]:
# Load a previously exported last-events file to patch its PREV_READMISSION_30_DAYS column.
# NOTE(review): this absolute path is hard-coded and points to a different workspace and
# dataset folder than the paths resolved from data_config above — confirm it is still valid.
df = pd.read_csv('/workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/mimic_cleaned/last_events.csv')
df.describe()

In [None]:
# Distribution of PREV_READMISSION_30_DAYS before recoding (includes the -1 marker).
df["PREV_READMISSION_30_DAYS"].value_counts()

PREV_READMISSION_30_DAYS
-1    7840
 0    1030
 1     339
Name: count, dtype: int64

In [None]:
# Recode the -1 marker in PREV_READMISSION_30_DAYS as 0; the values 0 and 1 are left untouched.
prev_readmission = df["PREV_READMISSION_30_DAYS"]
df["PREV_READMISSION_30_DAYS"] = prev_readmission.mask(prev_readmission == -1, 0)

In [None]:
# Distribution after recoding: only 0 and 1 should remain.
df["PREV_READMISSION_30_DAYS"].value_counts()

PREV_READMISSION_30_DAYS
0    8870
1     339
Name: count, dtype: int64

In [None]:
# NOTE(review): this overwrites the same file that was read above, so the original
# -1 values cannot be recovered from disk after this cell has run.
df.to_csv('/workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/mimic_cleaned/last_events.csv', index=False)

# Filter Relapse Dataset

### Set Filter Parameters

In [None]:
from recurrent_health_events_prediction.data_extraction.data_types import ProgramType


# Switches controlling how the relapse dataset is filtered. Names shared with the MIMIC
# section (SELECT_ONLY_ONE_EVENT_PER_PATIENT, SET_OBSERVATION_WINDOW_SURVIVAL,
# OBSERVATION_WINDOW) are rebound here.
SELECT_ONLY_ONE_EVENT_PER_PATIENT = True
SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES = True  # select patients with at least MIN_PREV_RELAPSES previous relapses
MIN_PREV_RELAPSES = 1  # minimum number of previous relapses (inclusive) for a patient to be selected
SET_OBSERVATION_WINDOW_SURVIVAL = True  # set observation window for survival analysis
OBSERVATION_WINDOW = 120  # days
USE_SPECIFIC_PROGRAM_TYPE = True  # use specific program type to filter patients
SPECIFIC_PROGRAM_TYPE = ProgramType.PROBATION  # specific program type to filter patients

In [None]:
# Resolve the relapse data folder from the configuration and derive all output file paths.
training_data_config = data_config["training_data"]["relapse"]
training_data_path = training_data_config["preprocessed_path"]
print(f"Training data path for relapse: {training_data_path}")

# NOTE(review): the folder name contains the typo "mutiple" and hard-codes "probation" and
# "120_days" instead of deriving them from SPECIFIC_PROGRAM_TYPE / OBSERVATION_WINDOW;
# renaming it would change where downstream code has to look for the files.
output_path = os.path.join(training_data_path, "mutiple_relapses_patients_probation_120_days")
os.makedirs(output_path, exist_ok=True)

outpath_filename_last_relapses = os.path.join(output_path, "last_relapses.csv")
outpath_filename_historical_relapses = os.path.join(output_path, "historical_relapses.csv")
output_path_filename_all_relapses = os.path.join(output_path, "all_relapses.csv")
# NOTE(review): no cell visible in this part of the notebook writes to this path.
output_path_filename_historical_drug_tests = os.path.join(output_path, "historical_drug_tests.csv")

print(f"Output path: {output_path}")
print("Output files: ")
print(outpath_filename_last_relapses)
print(outpath_filename_historical_relapses)
print(output_path_filename_all_relapses)
print(output_path_filename_historical_drug_tests)

Training data path for relapse: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed
Output path: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days
Output files: 
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/last_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/historical_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/all_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/historical_drug_tests.csv


### Import Data

In [None]:
# Repeats the configuration lookup of the output-path cell above, so this section
# can be run without it.
training_data_config = data_config["training_data"]["relapse"]
training_data_path = training_data_config["preprocessed_path"]

print(f"Training data path for relapse: {training_data_path}")

Training data path for relapse: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed


In [None]:
# Load the last relapse of each donor and report row and donor counts.
filename = "last_relapses.csv"
last_relapses_file_path = os.path.join(training_data_path, filename)
print(f"Using file for last relapses: {last_relapses_file_path}")  
last_relapses_df = pd.read_csv(last_relapses_file_path)

print(f"Number of last relapses: {len(last_relapses_df)}")
print("Number of unique patients in last relapses: ", last_relapses_df['DONOR_ID'].nunique())

Using file for last relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/last_relapses.csv
Number of last relapses: 53022
Number of unique patients in last relapses:  53022


In [None]:
# Load the historical relapses and report row and donor counts.
filename = "historical_relapses.csv"
historical_relapses_file_path = os.path.join(training_data_path, filename)
print(f"Using file for historical relapses: {historical_relapses_file_path}")  
historical_relapses_df = pd.read_csv(historical_relapses_file_path)

print(f"Number of historical relapses: {len(historical_relapses_df)}")
print("Number of unique patients in historical relapses: ", historical_relapses_df['DONOR_ID'].nunique())

Using file for historical relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/historical_relapses.csv
Number of historical relapses: 57572
Number of unique patients in historical relapses:  23634


In [None]:
# Load every relapse of every donor and report row and donor counts.
filename = "all_relapses.csv"
all_relapses_file_path = os.path.join(training_data_path, filename)
print(f"Using file for all relapses: {all_relapses_file_path}")  
all_relapses_df = pd.read_csv(all_relapses_file_path)

print(f"Number of all relapses: {len(all_relapses_df)}")
print("Number of unique patients in all relapses: ", all_relapses_df['DONOR_ID'].nunique())

Using file for all relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/all_relapses.csv
Number of all relapses: 118359
Number of unique patients in all relapses:  53022


In [None]:
# Load every drug test; this table is used further down to compute each donor's
# maximum gap between tests.
filename = "all_drug_tests.csv"
all_drug_tests_file_path = os.path.join(training_data_path, filename)
print(f"Using file for all drug tests: {all_drug_tests_file_path}")  
all_drug_tests_df = pd.read_csv(all_drug_tests_file_path)

print(f"Number of all drug tests: {len(all_drug_tests_df)}")
print("Number of unique patients in all drug tests: ", all_drug_tests_df['DONOR_ID'].nunique())

Using file for all drug tests: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/all_drug_tests.csv
Number of all drug tests: 1748832
Number of unique patients in all drug tests:  63193


In [None]:
# Load the historical drug tests and report row and donor counts.
filename = "historical_drug_tests.csv"
historical_drug_tests_file_path = os.path.join(training_data_path, filename)
print(f"Using file for historical drug tests: {historical_drug_tests_file_path}")
historical_drug_tests_df = pd.read_csv(historical_drug_tests_file_path)

print(f"Number of historical drug tests: {len(historical_drug_tests_df)}")
print("Number of unique patients in historical drug tests: ", historical_drug_tests_df['DONOR_ID'].nunique())

Using file for historical drug tests: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/historical_drug_tests.csv
Number of historical drug tests: 871341
Number of unique patients in historical drug tests:  53022


In [None]:
# Inspect column names, dtypes and non-null counts of the historical drug tests.
historical_drug_tests_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871341 entries, 0 to 871340
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   DONOR_ID                           871341 non-null  int64  
 1   TIME                               871341 non-null  object 
 2   COLLECTION_ID                      871341 non-null  int64  
 3   DRUG_POSITIVE                      871341 non-null  bool   
 4   SHOWEDUP                           871341 non-null  bool   
 5   PROGRAM_TYPE                       871341 non-null  object 
 6   NUM_DRUGS_TESTED                   871341 non-null  int64  
 7   DRUGS_TESTED                       871341 non-null  object 
 8   POSITIVE_DRUGS                     871341 non-null  object 
 9   NUM_DRUGS_POSITIVE                 871341 non-null  int64  
 10  FIRST_TEST_TIME                    871341 non-null  object 
 11  PARTICIPATION_DAYS                 8713

In [None]:
# Inspect column names, dtypes and non-null counts of the all-relapses table.
all_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118359 entries, 0 to 118358
Data columns (total 37 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   DONOR_ID                           118359 non-null  int64  
 1   COLLECTION_ID                      118359 non-null  int64  
 2   RELAPSE_START                      118359 non-null  object 
 3   RELAPSE_END                        118359 non-null  object 
 4   EVENT_DURATION                     118359 non-null  float64
 5   RELAPSE_EVENT                      118359 non-null  int64  
 6   NUM_TESTS_PERIOD                   118359 non-null  int64  
 7   NUM_PREV_RELAPSES                  118359 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          118359 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  118359 non-null  int64  
 10  RELAPSE_30_DAYS                    118359 non-null  int64  
 11  PREV_POSITIVE_DRUGS                1183

### Filter Last Relapses

In [None]:
# Preview the first ten rows of the unfiltered last relapses.
last_relapses_df.head(10)

Unnamed: 0,DONOR_ID,COLLECTION_ID,RELAPSE_START,RELAPSE_END,EVENT_DURATION,RELAPSE_EVENT,NUM_TESTS_PERIOD,NUM_PREV_RELAPSES,RELAPSE_DURATION_CATEGORY,RELAPSE_DURATION_CATEGORY_ENCODED,...,LOG_PARTICIPATION_DAYS,LOG_TIME_UNTIL_NEXT_POSITIVE,LOG_TIME_RELAPSE_PAST_MEAN,LOG_TIME_RELAPSE_PAST_MEDIAN,LOG_TIME_RELAPSE_PAST_STD,PREV_RELAPSE_30_DAYS,RELAPSE_30_DAYS_PAST_MEAN,RELAPSE_30_DAYS_PAST_SUM,IS_LAST_EVENT,IS_HISTORICAL_EVENT
0,10012,5699966,2018-05-17,2019-04-04,322.0,0,30,0,180+,3,...,7.160846,,7.156956,7.156956,,0,0.0,0.0,1,0
1,10066,1776266,2015-05-25,2016-05-03,344.0,1,31,0,180+,3,...,1.609438,5.843544,,,,0,,,1,0
2,10092,1095853,2014-10-10,2014-12-12,63.0,0,5,0,30-90,1,...,3.465736,,,,,0,,,1,0
3,10095,1591980,2015-03-11,2015-11-19,253.0,0,25,0,180+,3,...,3.401197,,,,,0,,,1,0
4,10136,1272979,2014-12-04,2015-02-10,68.0,0,4,0,30-90,1,...,2.70805,,,,,0,,,1,0
5,10141,2354200,2016-01-29,2016-05-24,116.0,1,6,4,90-180,2,...,5.545177,4.762174,3.107204,3.117205,0.968272,1,0.75,3.0,1,0
6,10173,1826720,2015-06-16,2015-06-21,5.0,1,2,1,0-30,0,...,4.317488,1.791759,2.833213,2.833213,,1,1.0,1.0,1,0
7,10219,2546456,2016-04-15,2016-04-16,1.0,1,1,11,0-30,0,...,6.09357,0.693147,2.603149,2.890372,1.112319,0,0.727273,8.0,1,0
8,10235,1592945,2015-03-11,2015-10-28,231.0,0,14,0,180+,3,...,4.356709,,,,,0,,,1,0
9,10354,1878665,2015-07-10,2016-02-05,210.0,0,6,0,180+,3,...,3.713572,,,,,0,,,1,0


In [None]:
# Baseline row and donor counts before any relapse filter is applied.
print(f"Initial number of last relapses: {len(last_relapses_df)}")
print("Initial number of unique patients in last relapses: ", last_relapses_df['DONOR_ID'].nunique())

Initial number of last relapses: 53022
Initial number of unique patients in last relapses:  53022


In [None]:
from recurrent_health_events_prediction.preprocessing.utils import filter_select_only_one_program_type

if USE_SPECIFIC_PROGRAM_TYPE:
    print(f"Filtering last relapses for program type: {SPECIFIC_PROGRAM_TYPE}")
    # Restrict the last relapses to the configured program type, using the PROGRAM_TYPE column.
    # NOTE(review): whether the helper keeps rows matching the type or donors enrolled in
    # only that type cannot be told from here — confirm in preprocessing.utils.
    last_relapses_df = filter_select_only_one_program_type(last_relapses_df, "PROGRAM_TYPE", SPECIFIC_PROGRAM_TYPE)

    print(f"Number of last relapses after filtering for program type: {len(last_relapses_df)}")
    print("Number of unique patients in last relapses after filtering for program type: ", last_relapses_df['DONOR_ID'].nunique())

Filtering last relapses for program type: ProgramType.PROBATION
Number of last relapses after filtering for program type: 24303
Number of unique patients in last relapses after filtering for program type:  24303


In [None]:
if SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES:
    print("Minimum number of previous relapses for a patient to be selected: ", MIN_PREV_RELAPSES)
    # Inclusive bound: donors with exactly MIN_PREV_RELAPSES previous relapses are kept.
    has_enough_prev_relapses = last_relapses_df["NUM_PREV_RELAPSES"].ge(MIN_PREV_RELAPSES)
    last_relapses_df = last_relapses_df.loc[has_enough_prev_relapses]
    print("Minimum number of previous relapses after filtering: ", last_relapses_df['NUM_PREV_RELAPSES'].min())

Minimum number of previous relapses for a patient to be selected:  1
Minimum number of previous relapses after filtering:  1


In [None]:
# Keep only rows whose TIME_SINCE_LAST_POSITIVE is at most 90; rows where the value
# is missing fail the comparison and are dropped as well.
filter_mask = last_relapses_df['TIME_SINCE_LAST_POSITIVE'].le(90)
last_relapses_df = last_relapses_df.loc[filter_mask]
print(f"Number of last relapses after filtering by TIME_SINCE_LAST_POSITIVE <= 90: {len(last_relapses_df)}")
print("Number of unique patients in last relapses after filtering: ", last_relapses_df['DONOR_ID'].nunique())

Number of last relapses after filtering by TIME_SINCE_LAST_POSITIVE <= 90: 7342
Number of unique patients in last relapses after filtering:  7342


In [None]:
# Restrict the drug tests to the donors that survived the previous filters, then
# compute each donor's largest TIME_UNTIL_NEXT_TEST as MAX_GAP_TIME.
donor_ids = last_relapses_df['DONOR_ID'].unique()
is_selected_donor = all_drug_tests_df['DONOR_ID'].isin(donor_ids)
all_drug_tests_df = all_drug_tests_df[is_selected_donor]

gap_time_df = (
    all_drug_tests_df
    .groupby('DONOR_ID')['TIME_UNTIL_NEXT_TEST']
    .max()
    .rename('MAX_GAP_TIME')
    .reset_index()
)
gap_time_df.head()

Unnamed: 0,DONOR_ID,MAX_GAP_TIME
0,10731,107.0
1,10801,188.0
2,10904,36.0
3,10994,625.0
4,11472,41.0


In [None]:
# Donor ids whose MAX_GAP_TIME is strictly below 180; a missing MAX_GAP_TIME never
# passes the comparison, so such donors are not considered valid.
has_short_max_gap = gap_time_df['MAX_GAP_TIME'].lt(180)
valid_donors_id_regarding_gap = gap_time_df.loc[has_short_max_gap, 'DONOR_ID'].unique()

In [None]:
# Keep only the donors whose MAX_GAP_TIME passed the < 180 check.
# The mask is built from the DONOR_ID column alone: a DataFrame-wide isin() would
# compare every cell of every column against the donor ids, which is both costly
# and not a usable row mask.
filter_mask = last_relapses_df['DONOR_ID'].isin(valid_donors_id_regarding_gap)
last_relapses_df = last_relapses_df[filter_mask]
print(f"Number of last relapses after filtering by MAX_GAP_TIME < 180: {len(last_relapses_df)}")
print("Number of unique patients in last relapses after filtering: ", last_relapses_df['DONOR_ID'].nunique())

Number of last relapses after filtering by MAX_GAP_TIME < 180: 5452
Number of unique patients in last relapses after filtering:  5452


In [None]:
# Rows with a missing TIME_SINCE_LAST_NEGATIVE are imputed as PARTICIPATION_DAYS + 90,
# and the log-transformed column is recomputed from the imputed values.
# assign() builds a new frame instead of writing columns into the filtered slice
# produced by the boolean-indexing steps above, which avoids SettingWithCopyWarning.
# NOTE(review): 90 is a magic number — confirm its rationale and consider a named constant.
# NOTE(review): np.log gives -inf for a value of 0, and other LOG_* columns in this data
# look like log1p transforms — confirm which transform is intended here.
last_relapses_df = last_relapses_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda frame: frame['TIME_SINCE_LAST_NEGATIVE'].fillna(
        frame['PARTICIPATION_DAYS'] + 90
    ),
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda frame: np.log(frame['TIME_SINCE_LAST_NEGATIVE']),
)

In [None]:
if SELECT_ONLY_ONE_EVENT_PER_PATIENT:
    # Keep only the earliest row (by RELAPSE_START) of each donor, so that no donor
    # contributes more than one event to the training data, which could otherwise
    # lead to overfitting and biased predictions.
    # drop_duplicates keeps whole rows. GroupBy.first() instead returns the first
    # non-null value of every column, so it could combine values from different
    # rows of the same donor whenever the earliest row contains missing values.
    print("Selecting only the first event for each patient")
    last_relapses_df = (
        last_relapses_df
        .sort_values(['DONOR_ID', 'RELAPSE_START'])
        .drop_duplicates(subset='DONOR_ID', keep='first')
        .reset_index(drop=True)
    )
    print(f"Number of last relapses after selecting just first relapse: {len(last_relapses_df)}")
    print("Number of unique patients in last relapses after selecting just first relapse: ", last_relapses_df['DONOR_ID'].nunique())

Selecting only the first event for each patient
Number of last relapses after selecting just first relapse: 5452
Number of unique patients in last relapses after selecting just first relapse:  5452


In [None]:
if SET_OBSERVATION_WINDOW_SURVIVAL:
    print("Setting observation window for survival analysis")
    print("Observation window (days): ", OBSERVATION_WINDOW)
    # Apply set_observation_window to every row, passing the window length and
    # 'RELAPSE_EVENT' as an extra positional argument.
    # NOTE(review): presumably 'RELAPSE_EVENT' names the event-indicator column and the
    # helper caps EVENT_DURATION at the window — confirm in training.utils_survival.
    last_relapses_df = last_relapses_df.apply(set_observation_window, args=(OBSERVATION_WINDOW, 'RELAPSE_EVENT'), axis=1)
    # Sanity checks: row/donor counts, duration range and the event values present after the step.
    print("Number of last relapses after setting observation window: ", len(last_relapses_df))
    print("Number of unique patients in last relapses after setting observation window: ", last_relapses_df['DONOR_ID'].nunique())
    print("Maximum and minimum event duration after setting observation window: ", last_relapses_df['EVENT_DURATION'].max(), last_relapses_df['EVENT_DURATION'].min())
    print("Unique values of RELAPSE_EVENT after setting observation window: ", last_relapses_df['RELAPSE_EVENT'].unique())

Setting observation window for survival analysis
Observation window (days):  120
Number of last relapses after setting observation window:  5452
Number of unique patients in last relapses after setting observation window:  5452
Maximum and minimum event duration after setting observation window:  120.0 1.0
Unique values of RELAPSE_EVENT after setting observation window:  [1 0]


In [None]:
# Distribution of EVENT_DURATION after the observation window step.
last_relapses_df.EVENT_DURATION.describe()

count    5452.000000
mean       48.069516
std        42.018059
min         1.000000
25%        12.000000
50%        33.000000
75%        82.000000
max       120.000000
Name: EVENT_DURATION, dtype: float64

In [None]:
# Final check of columns, dtypes and non-null counts before exporting the last relapses.
last_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5452 entries, 0 to 5451
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           5452 non-null   int64  
 1   COLLECTION_ID                      5452 non-null   int64  
 2   RELAPSE_START                      5452 non-null   object 
 3   RELAPSE_END                        5452 non-null   object 
 4   EVENT_DURATION                     5452 non-null   float64
 5   RELAPSE_EVENT                      5452 non-null   int64  
 6   NUM_TESTS_PERIOD                   5452 non-null   int64  
 7   NUM_PREV_RELAPSES                  5452 non-null   int64  
 8   RELAPSE_DURATION_CATEGORY          5452 non-null   object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  5452 non-null   int64  
 10  RELAPSE_30_DAYS                    5452 non-null   int64  
 11  PREV_POSITIVE_DRUGS                5452 non-null   objec

In [None]:
# Write the filtered last relapses to the output folder (row index is not exported).
print("Exporting last relapses to CSV file: ", outpath_filename_last_relapses)
last_relapses_df.to_csv(outpath_filename_last_relapses, index=False)

Exporting last relapses to CSV file:  /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/last_relapses.csv


### Filter Historical Relapses

In [None]:
# Restrict the historical relapses to the donors kept in the filtered last relapses.
donor_ids = last_relapses_df['DONOR_ID'].unique()
is_selected_donor = historical_relapses_df['DONOR_ID'].isin(donor_ids)
historical_relapses_df = historical_relapses_df.loc[is_selected_donor]

print(f"Number of historical relapses after filtering by DONOR_ID: {len(historical_relapses_df)}")
print("Number of unique patients in historical relapses after filtering: ", historical_relapses_df['DONOR_ID'].nunique())

Number of historical relapses after filtering by DONOR_ID: 14856
Number of unique patients in historical relapses after filtering:  5452


In [None]:
# Rows with a missing TIME_SINCE_LAST_NEGATIVE are imputed as PARTICIPATION_DAYS + 90,
# and the log-transformed column is recomputed from the imputed values.
# assign() builds a new frame instead of writing columns into the filtered slice
# produced by the DONOR_ID filter above, which avoids SettingWithCopyWarning.
# NOTE(review): 90 is a magic number — confirm its rationale and consider a named constant.
# NOTE(review): np.log gives -inf for a value of 0, and other LOG_* columns in this data
# look like log1p transforms — confirm which transform is intended here.
historical_relapses_df = historical_relapses_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda frame: frame['TIME_SINCE_LAST_NEGATIVE'].fillna(
        frame['PARTICIPATION_DAYS'] + 90
    ),
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda frame: np.log(frame['TIME_SINCE_LAST_NEGATIVE']),
)

In [None]:
# Print the column / non-null count / dtype summary of historical_relapses_df as a sanity check.
historical_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14856 entries, 25 to 57571
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           14856 non-null  int64  
 1   COLLECTION_ID                      14856 non-null  int64  
 2   RELAPSE_START                      14856 non-null  object 
 3   RELAPSE_END                        14856 non-null  object 
 4   EVENT_DURATION                     14856 non-null  float64
 5   RELAPSE_EVENT                      14856 non-null  int64  
 6   NUM_TESTS_PERIOD                   14856 non-null  int64  
 7   NUM_PREV_RELAPSES                  14856 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          14856 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  14856 non-null  int64  
 10  RELAPSE_30_DAYS                    14856 non-null  int64  
 11  PREV_POSITIVE_DRUGS                14856 non-null  object 

In [None]:
# Write historical_relapses_df to CSV; index=False keeps the row index out of the file.
historical_relapses_df.to_csv(
    path_or_buf=outpath_filename_historical_relapses,
    index=False,
)

### Filter All Relapses

In [None]:
# Restrict the full relapse table to the donors present in last_relapses_df.
donor_ids = last_relapses_df['DONOR_ID'].unique()
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignments made on it afterwards do not trigger SettingWithCopyWarning.
all_relapses_df = all_relapses_df[all_relapses_df['DONOR_ID'].isin(donor_ids)].copy()

# Sanity check: row count and number of distinct donors kept by the filter.
print(f"Number of all relapses after filtering by DONOR_ID: {len(all_relapses_df)}")
print("Number of unique patients in all relapses after filtering: ", all_relapses_df['DONOR_ID'].nunique())

Number of all relapses after filtering by DONOR_ID: 22867
Number of unique patients in all relapses after filtering:  5452


In [None]:
# Keep the observed TIME_SINCE_LAST_NEGATIVE where present; otherwise fall back to
# PARTICIPATION_DAYS + 90.
# NOTE(review): the rationale for the 90-day offset is not visible here — confirm and
# consider promoting it to a named constant.
all_relapses_df['TIME_SINCE_LAST_NEGATIVE'] = np.where(
    all_relapses_df['TIME_SINCE_LAST_NEGATIVE'].notna(),
    all_relapses_df['TIME_SINCE_LAST_NEGATIVE'],
    all_relapses_df['PARTICIPATION_DAYS'] + 90,
)

# Natural-log transform of the imputed column.
# NOTE(review): np.log returns -inf for 0 and NaN for negatives — confirm the values are strictly positive.
all_relapses_df['LOG_TIME_SINCE_LAST_NEGATIVE'] = np.log(
    all_relapses_df['TIME_SINCE_LAST_NEGATIVE']
)

In [None]:
# Print the column / non-null count / dtype summary of all_relapses_df as a sanity check.
all_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22867 entries, 40 to 118358
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           22867 non-null  int64  
 1   COLLECTION_ID                      22867 non-null  int64  
 2   RELAPSE_START                      22867 non-null  object 
 3   RELAPSE_END                        22867 non-null  object 
 4   EVENT_DURATION                     22867 non-null  float64
 5   RELAPSE_EVENT                      22867 non-null  int64  
 6   NUM_TESTS_PERIOD                   22867 non-null  int64  
 7   NUM_PREV_RELAPSES                  22867 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          22867 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  22867 non-null  int64  
 10  RELAPSE_30_DAYS                    22867 non-null  int64  
 11  PREV_POSITIVE_DRUGS                22867 non-null  object

In [None]:
# Optionally pass every row through set_observation_window, handing it the window
# length and the name of the event-indicator column.
# NOTE(review): presumably this caps EVENT_DURATION at OBSERVATION_WINDOW and censors
# later events — confirm against utils_survival.set_observation_window.
if SET_OBSERVATION_WINDOW_SURVIVAL:
    all_relapses_df = all_relapses_df.apply(
        set_observation_window,
        axis="columns",  # row-wise: the helper is called with one row at a time
        args=(OBSERVATION_WINDOW, 'RELAPSE_EVENT'),
    )

In [None]:
# Diagnostic: range of EVENT_DURATION and the distinct RELAPSE_EVENT values in all_relapses_df.
# These are printed whether or not the observation-window step actually ran.
print(
    "Maximum and minimum event duration after setting observation window: ",
    all_relapses_df['EVENT_DURATION'].max(),
    all_relapses_df['EVENT_DURATION'].min(),
)
print(
    "Unique values of RELAPSE_EVENT after setting observation window: ",
    all_relapses_df['RELAPSE_EVENT'].unique(),
)

Maximum and minimum event duration after setting observation window:  120.0 1.0
Unique values of RELAPSE_EVENT after setting observation window:  [1 0]


In [None]:
# Write all_relapses_df to CSV; index=False keeps the row index out of the file.
# The message names the table actually being written (all relapses).
print("Exporting all relapses to CSV file: ", output_path_filename_all_relapses)
# to_csv returns None when given a path, so its result must not be assigned back:
# doing so would replace the DataFrame with None for any later cell or re-run.
all_relapses_df.to_csv(output_path_filename_all_relapses, index=False)

Exporting last relapses to CSV file:  /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/all_relapses.csv


### Filter Historical Drug Tests

In [None]:
# Restrict the historical drug tests to the donors present in last_relapses_df.
donor_ids = last_relapses_df['DONOR_ID'].unique()
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignments made on it afterwards do not trigger SettingWithCopyWarning.
historical_drug_tests_df = historical_drug_tests_df[historical_drug_tests_df['DONOR_ID'].isin(donor_ids)].copy()

# Sanity check: row count and number of distinct donors kept by the filter.
print(f"Number of historical drug tests after filtering by DONOR_ID: {len(historical_drug_tests_df)}")
print("Number of unique patients in historical drug tests after filtering: ", historical_drug_tests_df['DONOR_ID'].nunique())

Number of historical drug tests after filtering by DONOR_ID: 129278
Number of unique patients in historical drug tests after filtering:  5452


In [None]:
# Keep the observed TIME_SINCE_LAST_NEGATIVE where present; otherwise fall back to
# PARTICIPATION_DAYS + 90.
# NOTE(review): the rationale for the 90-day offset is not visible here — confirm and
# consider promoting it to a named constant.
historical_drug_tests_df['TIME_SINCE_LAST_NEGATIVE'] = np.where(
    historical_drug_tests_df['TIME_SINCE_LAST_NEGATIVE'].notna(),
    historical_drug_tests_df['TIME_SINCE_LAST_NEGATIVE'],
    historical_drug_tests_df['PARTICIPATION_DAYS'] + 90,
)

# Natural-log transform of the imputed column.
# NOTE(review): np.log returns -inf for 0 and NaN for negatives — confirm the values are strictly positive.
historical_drug_tests_df['LOG_TIME_SINCE_LAST_NEGATIVE'] = np.log(
    historical_drug_tests_df['TIME_SINCE_LAST_NEGATIVE']
)

In [None]:
# Keep the observed TIME_SINCE_LAST_POSITIVE where present; otherwise fall back to
# PARTICIPATION_DAYS + 30.
# NOTE(review): the rationale for the 30-day offset is not visible here — confirm and
# consider promoting it to a named constant.
historical_drug_tests_df['TIME_SINCE_LAST_POSITIVE'] = np.where(
    historical_drug_tests_df['TIME_SINCE_LAST_POSITIVE'].notna(),
    historical_drug_tests_df['TIME_SINCE_LAST_POSITIVE'],
    historical_drug_tests_df['PARTICIPATION_DAYS'] + 30,
)

# Natural-log transform of the imputed column.
# NOTE(review): np.log returns -inf for 0 and NaN for negatives — confirm the values are strictly positive.
historical_drug_tests_df['LOG_TIME_SINCE_LAST_POSITIVE'] = np.log(
    historical_drug_tests_df['TIME_SINCE_LAST_POSITIVE']
)

In [None]:
# Write historical_drug_tests_df to CSV; index=False keeps the row index out of the file.
# to_csv returns None when given a path, so its result must not be assigned back:
# doing so would replace the DataFrame with None for any later cell or re-run.
historical_drug_tests_df.to_csv(output_path_filename_historical_drug_tests, index=False)

### Metadata

In [None]:
# Record the notebook's control flags next to the exported data so the run can be traced later.
# NOTE(review): the last two keys use *_HOSP names while their values come from the
# *_RELAPSES variables — confirm whether consumers of this YAML rely on these key names.
control_variables = dict(
    SELECT_ONLY_ONE_EVENT_PER_PATIENT=SELECT_ONLY_ONE_EVENT_PER_PATIENT,
    SET_OBSERVATION_WINDOW_SURVIVAL=SET_OBSERVATION_WINDOW_SURVIVAL,
    OBSERVATION_WINDOW=OBSERVATION_WINDOW,
    SELECT_PATIENTS_WITH_MORE_PREV_HOSP=SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES,
    MIN_PREV_HOSP=MIN_PREV_RELAPSES,
)

# The YAML file is written into output_path.
output_file_controls = os.path.join(output_path, "control_variables.yaml")

with open(output_file_controls, 'w') as f:
    yaml.dump(control_variables, f)

print(f"Control variables exported to: {output_file_controls}")

Control variables exported to: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/control_variables.yaml
