In [1]:
import yaml
from importlib import resources as impresources
from recurrent_health_events_prediction import configs
import pandas as pd
import numpy as np
import os

from recurrent_health_events_prediction.training.utils_survival import set_observation_window
from recurrent_health_events_prediction.preprocessing.utils import filter_select_only_one_program_type



# Load the packaged data configuration.
# Traversable.open() is used rather than the built-in open(): the object returned by
# impresources.files() is only guaranteed to be a Traversable, not an os.PathLike,
# so built-in open() fails when the package is not a plain directory on disk.
with (impresources.files(configs) / 'data_config.yaml').open('r', encoding='utf-8') as f:
    data_config = yaml.safe_load(f)

# Filter MIMIC Dataset

In [2]:
# Pick the MIMIC entry of the data config and the directory of its preprocessed files.
training_data_config = data_config["training_data"]["mimic"]
training_data_path = training_data_config["preprocessed_path"]

In [3]:
# Input CSV with the last events; built with os.path.join like the output paths
# in this notebook, so a trailing separator in the config does not matter.
last_events_file_path = os.path.join(training_data_path, "last_events.csv")
print(f"Using file for last events: {last_events_file_path}")

Using file for last events: /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/last_events.csv


In [4]:
# Input CSV with all events; built with os.path.join like the output paths
# in this notebook, so a trailing separator in the config does not matter.
all_events_file_path = os.path.join(training_data_path, "all_events.csv")
print(f"Using file for all events: {all_events_file_path}")

Using file for all events: /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/all_events.csv


In [5]:
# Input CSV with the historical events; built with os.path.join like the output
# paths in this notebook, so a trailing separator in the config does not matter.
historical_events_file_path = os.path.join(training_data_path, "historical_events.csv")
print(f"Using file for historical events: {historical_events_file_path}")

Using file for historical events: /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/historical_events.csv


In [6]:
# Read the three event tables in one pass, in the order last / all / historical.
last_events_df, all_events_df, historical_events_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        last_events_file_path,
        all_events_file_path,
        historical_events_file_path,
    )
)

In [7]:
# Column names, non-null counts and dtypes of the last-events table.
last_events_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11614 entries, 0 to 11613
Data columns (total 56 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SUBJECT_ID                            11614 non-null  int64  
 1   HADM_ID                               11614 non-null  int64  
 2   ADMITTIME                             11614 non-null  object 
 3   DISCHTIME                             11614 non-null  object 
 4   ADMISSION_TYPE                        11614 non-null  object 
 5   ETHNICITY                             11614 non-null  object 
 6   DISCHARGE_LOCATION                    11614 non-null  object 
 7   INSURANCE                             11614 non-null  object 
 8   HOSPITALIZATION_DAYS                  11614 non-null  float64
 9   NUM_COMORBIDITIES                     11614 non-null  int64  
 10  TYPES_COMORBIDITIES                   11614 non-null  object 
 11  HAS_DIABETES   

In [8]:
# Summary statistics of the selected columns of the last-events table.
last_events_df.loc[
    :,
    [
        'SUBJECT_ID',
        'NUM_PREV_HOSPITALIZATIONS',
        'TOTAL_HOSPITALIZATIONS',
        'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION',
        'READMISSION_30_DAYS',
    ],
].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,11614.0,11614.0,11614.0,4317.0,11614.0
mean,37468.04882,0.290684,1.66239,4.992352,0.082659
std,29183.324972,1.040415,1.296659,1.795378,0.275378
min,21.0,0.0,1.0,1.105533,0.0
25%,13648.75,0.0,1.0,3.618714,0.0
50%,27170.0,0.0,1.0,5.132156,0.0
75%,61159.5,0.0,2.0,6.479482,0.0
max,99991.0,22.0,24.0,8.320927,1.0


In [9]:
# Summary statistics of the selected columns of the historical-events table.
historical_events_df.loc[
    :,
    [
        'SUBJECT_ID',
        'NUM_PREV_HOSPITALIZATIONS',
        'TOTAL_HOSPITALIZATIONS',
        'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION',
        'READMISSION_30_DAYS',
    ],
].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,3376.0,3376.0,3376.0,3376.0,3376.0
mean,26390.755924,1.507109,6.014218,4.804943,0.228969
std,24149.449795,2.52254,3.912802,1.6623,0.420232
min,36.0,0.0,3.0,1.103001,0.0
25%,10188.0,0.0,3.0,3.553506,0.0
50%,19363.5,1.0,5.0,4.878766,0.0
75%,29037.0,2.0,7.0,6.155949,0.0
max,99982.0,21.0,24.0,8.252735,1.0


In [10]:
# Summary statistics of the selected columns of the all-events table.
all_events_df.loc[
    :,
    [
        'SUBJECT_ID',
        'NUM_PREV_HOSPITALIZATIONS',
        'TOTAL_HOSPITALIZATIONS',
        'LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION',
        'READMISSION_30_DAYS',
    ],
].describe()

Unnamed: 0,SUBJECT_ID,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS
count,23144.0,23144.0,23144.0,7693.0,23144.0
mean,34346.306818,0.698108,2.396215,4.91011,0.074879
std,28286.571306,1.556074,2.465479,1.740609,0.263202
min,3.0,0.0,1.0,1.103001,0.0
25%,12371.75,0.0,1.0,3.585099,0.0
50%,24348.0,0.0,2.0,5.018199,0.0
75%,55059.0,1.0,3.0,6.336979,0.0
max,99995.0,23.0,24.0,8.320927,1.0


In [11]:
# Directory that receives the filtered MIMIC tables written by this notebook.
# Built with os.path.join for consistency with the output file paths below.
output_path = os.path.join(training_data_path, "mimic_cleaned")
print(f"Saving output to: {output_path}")

Saving output to: /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/mimic_cleaned


In [12]:
# Create the output directory; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_path, exist_ok=True)

In [13]:
# Switches controlling the MIMIC filtering steps below.
EXCLUDE_ELECTIVE = True
SELECT_ONLY_ONE_EVENT_PER_PATIENT = True
SET_OBSERVATION_WINDOW_SURVIVAL = True  # set observation window for survival analysis
OBSERVATION_WINDOW = 120  # days
SELECT_PATIENTS_WITH_MORE_PREV_HOSP = False  # select patients with at least MIN_PREV_HOSP previous hospitalizations
MIN_PREV_HOSP = None  # minimum number of previous hospitalizations for a patient to be selected; must be set when the flag above is True

output_file_last_events = os.path.join(output_path, "last_events.csv")

In [14]:
# Value counts of AFTER_HOSP_DEATH_EVENT before any filter is applied.
last_events_df["AFTER_HOSP_DEATH_EVENT"].value_counts().sort_index()

AFTER_HOSP_DEATH_EVENT
0    9850
1    1764
Name: count, dtype: int64

## Filter Last Events

In [15]:
# Baseline counts: this cell runs BEFORE the elective-admission filter, so the
# label says "before" (the original message wrongly said "after").
print(f"Number of last events before excluding elective admissions: {len(last_events_df)}")
print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events after excluding elective admissions: 11614
Number of unique patients in last events:  11614


In [16]:
if EXCLUDE_ELECTIVE:
    # Keep a row only when neither its next admission nor its own admission is ELECTIVE.
    last_events_df = last_events_df[
        (last_events_df["NEXT_ADMISSION_TYPE"] != "ELECTIVE")
        & (last_events_df["ADMISSION_TYPE"] != "ELECTIVE")
    ]

In [17]:
# Row and distinct-patient counts once elective admissions are removed.
print("Number of last events after excluding elective admissions:", len(last_events_df))
print(f"Number of unique patients in last events:  {last_events_df['SUBJECT_ID'].nunique()}")

Number of last events after excluding elective admissions: 9209
Number of unique patients in last events:  9209


In [18]:
if SELECT_ONLY_ONE_EVENT_PER_PATIENT:
    # Keep only the earliest event (by ADMITTIME) of each subject, so that no
    # subject contributes more than one row to the training data.
    #
    # drop_duplicates keeps the whole first ROW per subject. GroupBy.first(), used
    # previously, returns the first NON-NULL value of each column, so for a subject
    # with several rows it could combine values taken from different events.
    last_events_df = (
        last_events_df
        .sort_values(['SUBJECT_ID', 'ADMITTIME'])
        .drop_duplicates(subset='SUBJECT_ID', keep='first')
        .reset_index(drop=True)
    )

In [19]:
# Counts after the one-event-per-patient step; the label names that step
# (the original message was copy-pasted from the elective-admission filter).
print(f"Number of last events after selecting one event per patient: {len(last_events_df)}")
print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events after excluding elective admissions: 9209
Number of unique patients in last events:  9209


In [20]:
if SELECT_PATIENTS_WITH_MORE_PREV_HOSP:
    # Fail early with a clear message instead of an opaque comparison error when
    # the flag is enabled but the threshold was left unset.
    if MIN_PREV_HOSP is None:
        raise ValueError(
            "MIN_PREV_HOSP must be set when SELECT_PATIENTS_WITH_MORE_PREV_HOSP is True"
        )
    # Keep rows with at least MIN_PREV_HOSP previous hospitalizations.
    last_events_df = last_events_df[last_events_df["NUM_PREV_HOSPITALIZATIONS"] >= MIN_PREV_HOSP]

In [21]:
# Counts after the previous-hospitalizations filter; the label names that step
# (the original message was copy-pasted from the elective-admission filter).
print(f"Number of last events after filtering by previous hospitalizations: {len(last_events_df)}")
print("Number of unique patients in last events: ", last_events_df['SUBJECT_ID'].nunique())

Number of last events after excluding elective admissions: 9209
Number of unique patients in last events:  9209


In [22]:
if SET_OBSERVATION_WINDOW_SURVIVAL:
    # Run set_observation_window on every row, passing OBSERVATION_WINDOW as its second argument.
    last_events_df = last_events_df.apply(
        lambda event_row: set_observation_window(event_row, OBSERVATION_WINDOW),
        axis=1,
    )

In [23]:
# Summary statistics of the selected columns after all last-event filters.
last_events_df.loc[
    :,
    [
        "NUM_PREV_HOSPITALIZATIONS",
        "TOTAL_HOSPITALIZATIONS",
        "LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION",
        "READMISSION_30_DAYS",
        "EVENT_DURATION",
    ],
].describe()

Unnamed: 0,NUM_PREV_HOSPITALIZATIONS,TOTAL_HOSPITALIZATIONS,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,READMISSION_30_DAYS,EVENT_DURATION
count,9209.0,9209.0,3353.0,9209.0,9209.0
mean,0.326637,1.690737,4.86934,0.088826,105.839525
std,1.129605,1.386732,1.798787,0.284508,33.709856
min,0.0,1.0,1.105533,0.0,2.020833
25%,0.0,1.0,3.471406,0.0,120.0
50%,0.0,1.0,4.948952,0.0,120.0
75%,0.0,2.0,6.361541,0.0,120.0
max,22.0,24.0,8.320927,1.0,120.0


In [24]:
# Write the filtered last-events table; index=False leaves the row index out of the CSV.
last_events_df.to_csv(output_file_last_events, index=False)

In [25]:
# Display-only preview keyed by HADM_ID: the result is not assigned, so
# all_events_df itself keeps its original index.
all_events_df.set_index("HADM_ID")

Unnamed: 0_level_0,SUBJECT_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ETHNICITY,DISCHARGE_LOCATION,INSURANCE,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,...,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT,READMISSION_TIME_CAT_ENCODED
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
145834,3,2101-10-20 19:08:00,2101-10-31 13:58:00,EMERGENCY,WHITE,SNF,Medicare,10.784722,3,"['congestive_heart_failure', 'myocardial_infar...",...,0,1,2.466804,1.955091,0.000000,0.000000,0,0,,-1
150750,9,2149-11-09 13:06:00,2149-11-14 10:15:00,EMERGENCY,UNKNOWN/NOT SPECIFIED,DEAD/EXPIRED,Medicaid,4.881250,3,"['congestive_heart_failure', 'other', 'cerebro...",...,1,0,1.771769,1.844203,0.000000,3.367296,0,0,,-1
109451,21,2134-09-11 12:17:00,2134-09-24 16:15:00,EMERGENCY,WHITE,REHAB/DISTINCT PART HOSP,Medicare,13.165278,6,"['myocardial_infarct', 'other', 'cerebrovascul...",...,0,0,2.650794,1.931770,0.000000,3.970292,1,0,120+,2
111970,21,2135-01-30 20:50:00,2135-02-08 02:08:00,EMERGENCY,WHITE,DEAD/EXPIRED,Medicare,8.220833,6,"['other', 'cerebrovascular_disease', 'diabetes...",...,1,0,2.221465,2.236950,0.693147,3.737670,0,0,,-1
197661,26,2126-05-06 15:16:00,2126-05-13 15:00:00,EMERGENCY,UNKNOWN/NOT SPECIFIED,HOME,Medicare,6.988889,3,"['congestive_heart_failure', 'myocardial_infar...",...,0,1,2.078052,1.144440,0.000000,3.433987,1,0,120+,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151454,99982,2156-11-28 11:56:00,2156-12-08 13:45:00,EMERGENCY,WHITE,HOME HEALTH CARE,Medicare,10.075694,2,"['congestive_heart_failure', 'other']",...,0,0,2.404753,2.191571,0.000000,3.850148,0,1,0-30,0
112748,99982,2157-01-05 17:27:00,2157-01-12 13:00:00,EMERGENCY,WHITE,HOME,Medicare,6.814583,3,"['congestive_heart_failure', 'other', 'chronic...",...,0,0,2.055992,1.782165,0.693147,3.295837,1,0,30-120,1
183791,99982,2157-02-16 17:31:00,2157-02-22 20:36:00,EMERGENCY,WHITE,SHORT TERM HOSPITAL,Medicare,6.128472,2,"['congestive_heart_failure', 'other']",...,0,1,1.964097,1.969920,1.098612,3.688879,0,0,,-1
151118,99991,2184-12-24 08:30:00,2185-01-05 12:15:00,ELECTIVE,WHITE,HOME,Private,12.156250,3,"['congestive_heart_failure', 'other', 'diabete...",...,0,0,2.576897,1.421327,0.000000,3.871201,1,0,120+,2


## Filter All Events

In [26]:
# Destination CSV for the filtered all-events table.
output_file_all_events = os.path.join(output_path, "all_events.csv")

In [27]:
# Row and distinct-patient counts of the all-events table before truncation.
print("Number of all events before selecting events up to last event:", len(all_events_df))
print("Number of unique patients in all events:", all_events_df['SUBJECT_ID'].nunique())

Number of all events before selecting events up to last event: 23144
Number of unique patients in all events: 15451


In [28]:
from recurrent_health_events_prediction.preprocessing.utils import get_rows_up_to_event_id

# Series indexed by SUBJECT_ID holding the HADM_ID of each patient's retained last event.
last_event_ids = last_events_df.set_index("SUBJECT_ID")["HADM_ID"]
# NOTE(review): presumably keeps, per patient, the rows up to and including that
# HADM_ID (include_event_id=True) — confirm against get_rows_up_to_event_id.
all_events_df = get_rows_up_to_event_id(all_events_df, event_ids=last_event_ids, event_id_col="HADM_ID", include_event_id=True)

In [29]:
# Row and distinct-patient counts of the all-events table after truncation.
print("Number of all events after selecting events up to last event:", len(all_events_df))
print("Number of unique patients in all events:", all_events_df['SUBJECT_ID'].nunique())

Number of all events after selecting events up to last event: 12217
Number of unique patients in all events: 9209


In [30]:
if SET_OBSERVATION_WINDOW_SURVIVAL:
    # Run set_observation_window on every row, passing OBSERVATION_WINDOW as its second argument.
    all_events_df = all_events_df.apply(
        lambda event_row: set_observation_window(event_row, OBSERVATION_WINDOW),
        axis=1,
    )

In [31]:
# Write the filtered all-events table; index=False leaves the row index out of the CSV.
all_events_df.to_csv(output_file_all_events, index=False)

## Filter Historical Events

In [32]:
# Destination CSV for the historical-events table.
output_file_historical_events = os.path.join(output_path, "historical_events.csv")

In [33]:
# Row and distinct-patient counts of the historical-events table.
print("Number of events in historical events:", len(historical_events_df))
print("Number of unique patients in historical events:", historical_events_df['SUBJECT_ID'].nunique())

Number of events in historical events: 3376
Number of unique patients in historical events: 1610


In [34]:
# NOTE(review): historical_events_df is written exactly as loaded — none of the
# filters applied to the last/all events in this notebook are applied to it.
# Confirm that exporting it unfiltered is intended.
historical_events_df.to_csv(output_file_historical_events, index=False)

## Metadata

In [35]:
# Record the filter settings used for this export next to the output files.
control_variables = dict(
    EXCLUDE_ELECTIVE=EXCLUDE_ELECTIVE,
    SELECT_ONLY_ONE_EVENT_PER_PATIENT=SELECT_ONLY_ONE_EVENT_PER_PATIENT,
    SET_OBSERVATION_WINDOW_SURVIVAL=SET_OBSERVATION_WINDOW_SURVIVAL,
    OBSERVATION_WINDOW=OBSERVATION_WINDOW,
    SELECT_PATIENTS_WITH_MORE_PREV_HOSP=SELECT_PATIENTS_WITH_MORE_PREV_HOSP,
    MIN_PREV_HOSP=MIN_PREV_HOSP,
)

output_file_controls = os.path.join(output_path, "control_variables.yaml")

# Serialize the settings as YAML into the output directory.
with open(output_file_controls, 'w') as f:
    yaml.dump(control_variables, stream=f)

print("Control variables exported to: " + output_file_controls)

Control variables exported to: /workspaces/master-thesis-recurrent-health-events-prediction/data/mimic-iii-preprocessed/copd_heart_failure/mimic_cleaned/control_variables.yaml


### NA Treatment for Patients with Only One Hospital Admission

In [36]:
# Reload the exported last-events table from the same path it was written to
# earlier in this notebook, instead of repeating a hard-coded absolute path.
df = pd.read_csv(output_file_last_events)
df.describe()

Unnamed: 0,SUBJECT_ID,HADM_ID,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,NUM_PREV_HOSPITALIZATIONS,DAYS_SINCE_LAST_HOSPITALIZATION,DAYS_UNTIL_NEXT_HOSPITALIZATION,LOG_DAYS_SINCE_LAST_HOSPITALIZATION,LOG_DAYS_UNTIL_NEXT_HOSPITALIZATION,PREV_READMISSION_30_DAYS,...,EVENT_DURATION,IN_HOSP_DEATH_EVENT,AFTER_HOSP_DEATH_EVENT,LOG_HOSPITALIZATION_DAYS,LOG_DAYS_IN_ICU,LOG_NUM_PREV_HOSPITALIZATIONS,LOG_NUM_DRUGS,IS_LAST_EVENT,IS_HISTORICAL_EVENT,READMISSION_TIME_CAT_ENCODED
count,9209.0,9209.0,9209.0,9209.0,9209.0,1369.0,3353.0,1369.0,3353.0,9209.0,...,9209.0,9209.0,9209.0,9209.0,9209.0,9209.0,9209.0,9209.0,9209.0,9209.0
mean,37315.122815,150452.692475,10.504397,3.417635,0.326637,381.427651,450.453461,4.802124,4.86934,-0.814529,...,105.839525,0.0,0.163101,2.198467,1.398284,0.15437,3.299533,1.0,0.0,1.738408
std,29165.096143,29087.521876,9.696931,1.240506,1.129605,557.491008,671.88624,1.720247,1.798787,0.474046,...,33.709856,0.0,0.369478,0.680931,0.709908,0.407543,1.078696,0.0,0.0,0.608978
min,21.0,100006.0,0.5,1.0,0.0,2.05625,2.020833,1.117189,1.105533,-1.0,...,2.020833,0.0,0.0,0.405465,0.0,0.0,0.0,1.0,0.0,0.0
25%,13631.0,125240.0,4.748611,2.0,0.0,30.270139,31.181944,3.442664,3.471406,-1.0,...,120.0,0.0,0.0,1.748958,0.841767,0.0,3.218876,1.0,0.0,2.0
50%,26950.0,150583.0,7.816667,3.0,0.0,130.345139,140.027083,4.877829,4.948952,-1.0,...,120.0,0.0,0.0,2.176644,1.239142,0.0,3.555348,1.0,0.0,2.0
75%,61030.0,175657.0,12.895833,4.0,0.0,533.139583,578.138194,6.280657,6.361541,-1.0,...,120.0,0.0,0.0,2.631589,1.782214,0.0,3.912023,1.0,0.0,2.0
max,99982.0,199994.0,161.573611,9.0,22.0,3837.107639,4107.96875,8.252735,8.320927,1.0,...,120.0,0.0,1.0,5.091131,4.475613,3.135494,4.969813,1.0,0.0,2.0


In [37]:
# Value counts of PREV_READMISSION_30_DAYS before recoding (includes the -1 code).
df["PREV_READMISSION_30_DAYS"].value_counts()

PREV_READMISSION_30_DAYS
-1    7840
 0    1030
 1     339
Name: count, dtype: int64

In [38]:
# Recode the -1 entries of PREV_READMISSION_30_DAYS to 0; all other values are kept.
df.loc[df["PREV_READMISSION_30_DAYS"] == -1, "PREV_READMISSION_30_DAYS"] = 0

In [39]:
# Value counts of PREV_READMISSION_30_DAYS after recoding; -1 should no longer appear.
df["PREV_READMISSION_30_DAYS"].value_counts()

PREV_READMISSION_30_DAYS
0    8870
1     339
Name: count, dtype: int64

In [40]:
# Overwrite the exported last-events CSV with the recoded table, using the
# notebook's output path variable instead of a hard-coded absolute path.
df.to_csv(output_file_last_events, index=False)

# Filter Relapse Dataset

### Set Filter Parameters

In [2]:
from recurrent_health_events_prediction.data_extraction.data_types import ProgramType


# Switches controlling the relapse filtering steps below.
SELECT_ONLY_ONE_EVENT_PER_PATIENT = True
SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES = True  # select patients with at least MIN_PREV_RELAPSES previous relapses
MIN_PREV_RELAPSES = 1  # minimum number of previous relapses for a patient to be selected
SET_OBSERVATION_WINDOW_SURVIVAL = True  # set observation window for survival analysis
OBSERVATION_WINDOW = 120  # days
USE_SPECIFIC_PROGRAM_TYPE = True  # keep only patients of SPECIFIC_PROGRAM_TYPE
SPECIFIC_PROGRAM_TYPE = ProgramType.PROBATION  # program type kept when the flag above is True

In [3]:
# Resolve the directory of the preprocessed relapse data from the config.
training_data_config = data_config["training_data"]["relapse"]
training_data_path = training_data_config["preprocessed_path"]
print("Training data path for relapse:", training_data_path)

# NOTE(review): "mutiple" looks like a typo of "multiple", and the directory name
# hard-codes "probation" and "120_days" although SPECIFIC_PROGRAM_TYPE and
# OBSERVATION_WINDOW are configurable. Left unchanged because other code may
# already read from this path — confirm before renaming.
output_path = os.path.join(training_data_path, "mutiple_relapses_patients_probation_120_days")
os.makedirs(output_path, exist_ok=True)

# Destination files for each filtered table.
outpath_filename_last_relapses = os.path.join(output_path, "last_relapses.csv")
outpath_filename_historical_relapses = os.path.join(output_path, "historical_relapses.csv")
output_path_filename_all_relapses = os.path.join(output_path, "all_relapses.csv")
output_path_filename_historical_drug_tests = os.path.join(output_path, "historical_drug_tests.csv")

print("Output path:", output_path)
print("Output files: ")
# One destination per line, in the same order as they are defined above.
print(
    outpath_filename_last_relapses,
    outpath_filename_historical_relapses,
    output_path_filename_all_relapses,
    output_path_filename_historical_drug_tests,
    sep="\n",
)

Training data path for relapse: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed
Output path: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days
Output files: 
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/last_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/historical_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/all_relapses.csv
/workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/historical_drug_tests.csv


### Import Data

In [4]:
# Same config lookup as in the parameter section above; re-assigns identical values
# so the import section can be run on its own.
training_data_config = data_config["training_data"]["relapse"]
training_data_path = training_data_config["preprocessed_path"]

print(f"Training data path for relapse: {training_data_path}")

Training data path for relapse: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed


In [5]:
# Load the last-relapses table from the preprocessed relapse directory.
filename = "last_relapses.csv"
last_relapses_file_path = os.path.join(training_data_path, filename)
print("Using file for last relapses: " + last_relapses_file_path)
last_relapses_df = pd.read_csv(last_relapses_file_path)

# Report the row count and the number of distinct DONOR_ID values.
print("Number of last relapses:", len(last_relapses_df))
print(f"Number of unique patients in last relapses:  {last_relapses_df['DONOR_ID'].nunique()}")

Using file for last relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/last_relapses.csv
Number of last relapses: 53022
Number of unique patients in last relapses:  53022


In [6]:
# Load the historical-relapses table from the preprocessed relapse directory.
filename = "historical_relapses.csv"
historical_relapses_file_path = os.path.join(training_data_path, filename)
print("Using file for historical relapses: " + historical_relapses_file_path)
historical_relapses_df = pd.read_csv(historical_relapses_file_path)

# Report the row count and the number of distinct DONOR_ID values.
print("Number of historical relapses:", len(historical_relapses_df))
print(f"Number of unique patients in historical relapses:  {historical_relapses_df['DONOR_ID'].nunique()}")

Using file for historical relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/historical_relapses.csv
Number of historical relapses: 57572
Number of unique patients in historical relapses:  23634


In [7]:
# Load the all-relapses table from the preprocessed relapse directory.
filename = "all_relapses.csv"
all_relapses_file_path = os.path.join(training_data_path, filename)
print("Using file for all relapses: " + all_relapses_file_path)
all_relapses_df = pd.read_csv(all_relapses_file_path)

# Report the row count and the number of distinct DONOR_ID values.
print("Number of all relapses:", len(all_relapses_df))
print(f"Number of unique patients in all relapses:  {all_relapses_df['DONOR_ID'].nunique()}")

Using file for all relapses: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/all_relapses.csv
Number of all relapses: 118359
Number of unique patients in all relapses:  53022


In [8]:
# Load the all-drug-tests table from the preprocessed relapse directory.
filename = "all_drug_tests.csv"
all_drug_tests_file_path = os.path.join(training_data_path, filename)
print("Using file for all drug tests: " + all_drug_tests_file_path)
all_drug_tests_df = pd.read_csv(all_drug_tests_file_path)

# Report the row count and the number of distinct DONOR_ID values.
print("Number of all drug tests:", len(all_drug_tests_df))
print(f"Number of unique patients in all drug tests:  {all_drug_tests_df['DONOR_ID'].nunique()}")

Using file for all drug tests: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/all_drug_tests.csv
Number of all drug tests: 1748832
Number of unique patients in all drug tests:  63193


In [9]:
# Load the historical-drug-tests table from the preprocessed relapse directory.
filename = "historical_drug_tests.csv"
historical_drug_tests_file_path = os.path.join(training_data_path, filename)
print("Using file for historical drug tests: " + historical_drug_tests_file_path)
historical_drug_tests_df = pd.read_csv(historical_drug_tests_file_path)

# Report the row count and the number of distinct DONOR_ID values.
print("Number of historical drug tests:", len(historical_drug_tests_df))
print(f"Number of unique patients in historical drug tests:  {historical_drug_tests_df['DONOR_ID'].nunique()}")

Using file for historical drug tests: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/historical_drug_tests.csv
Number of historical drug tests: 871341
Number of unique patients in historical drug tests:  53022


In [10]:
# Column names, non-null counts and dtypes of the historical drug tests table.
historical_drug_tests_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871341 entries, 0 to 871340
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   DONOR_ID                           871341 non-null  int64  
 1   TIME                               871341 non-null  object 
 2   COLLECTION_ID                      871341 non-null  int64  
 3   DRUG_POSITIVE                      871341 non-null  bool   
 4   SHOWEDUP                           871341 non-null  bool   
 5   PROGRAM_TYPE                       871341 non-null  object 
 6   NUM_DRUGS_TESTED                   871341 non-null  int64  
 7   DRUGS_TESTED                       871341 non-null  object 
 8   POSITIVE_DRUGS                     871341 non-null  object 
 9   NUM_DRUGS_POSITIVE                 871341 non-null  int64  
 10  FIRST_TEST_TIME                    871341 non-null  object 
 11  PARTICIPATION_DAYS                 8713

In [11]:
# Column names, non-null counts and dtypes of the all-relapses table.
all_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118359 entries, 0 to 118358
Data columns (total 37 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   DONOR_ID                           118359 non-null  int64  
 1   COLLECTION_ID                      118359 non-null  int64  
 2   RELAPSE_START                      118359 non-null  object 
 3   RELAPSE_END                        118359 non-null  object 
 4   EVENT_DURATION                     118359 non-null  float64
 5   RELAPSE_EVENT                      118359 non-null  int64  
 6   NUM_TESTS_PERIOD                   118359 non-null  int64  
 7   NUM_PREV_RELAPSES                  118359 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          118359 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  118359 non-null  int64  
 10  RELAPSE_30_DAYS                    118359 non-null  int64  
 11  PREV_POSITIVE_DRUGS                1183

### Filter Last Relapses

In [12]:
# Preview the first 10 rows of the last-relapses table before filtering.
last_relapses_df.head(10)

Unnamed: 0,DONOR_ID,COLLECTION_ID,RELAPSE_START,RELAPSE_END,EVENT_DURATION,RELAPSE_EVENT,NUM_TESTS_PERIOD,NUM_PREV_RELAPSES,RELAPSE_DURATION_CATEGORY,RELAPSE_DURATION_CATEGORY_ENCODED,...,LOG_PARTICIPATION_DAYS,LOG_TIME_UNTIL_NEXT_POSITIVE,LOG_TIME_RELAPSE_PAST_MEAN,LOG_TIME_RELAPSE_PAST_MEDIAN,LOG_TIME_RELAPSE_PAST_STD,PREV_RELAPSE_30_DAYS,RELAPSE_30_DAYS_PAST_MEAN,RELAPSE_30_DAYS_PAST_SUM,IS_LAST_EVENT,IS_HISTORICAL_EVENT
0,10012,5699966,2018-05-17,2019-04-04,322.0,0,30,0,180+,3,...,7.160846,,7.156956,7.156956,,0,0.0,0.0,1,0
1,10066,1776266,2015-05-25,2016-05-03,344.0,1,31,0,180+,3,...,1.609438,5.843544,,,,0,,,1,0
2,10092,1095853,2014-10-10,2014-12-12,63.0,0,5,0,30-90,1,...,3.465736,,,,,0,,,1,0
3,10095,1591980,2015-03-11,2015-11-19,253.0,0,25,0,180+,3,...,3.401197,,,,,0,,,1,0
4,10136,1272979,2014-12-04,2015-02-10,68.0,0,4,0,30-90,1,...,2.70805,,,,,0,,,1,0
5,10141,2354200,2016-01-29,2016-05-24,116.0,1,6,4,90-180,2,...,5.545177,4.762174,3.107204,3.117205,0.968272,1,0.75,3.0,1,0
6,10173,1826720,2015-06-16,2015-06-21,5.0,1,2,1,0-30,0,...,4.317488,1.791759,2.833213,2.833213,,1,1.0,1.0,1,0
7,10219,2546456,2016-04-15,2016-04-16,1.0,1,1,11,0-30,0,...,6.09357,0.693147,2.603149,2.890372,1.112319,0,0.727273,8.0,1,0
8,10235,1592945,2015-03-11,2015-10-28,231.0,0,14,0,180+,3,...,4.356709,,,,,0,,,1,0
9,10354,1878665,2015-07-10,2016-02-05,210.0,0,6,0,180+,3,...,3.713572,,,,,0,,,1,0


In [13]:
# Baseline row and distinct-donor counts before any relapse filter.
print("Initial number of last relapses:", len(last_relapses_df))
print(f"Initial number of unique patients in last relapses:  {last_relapses_df['DONOR_ID'].nunique()}")

Initial number of last relapses: 53022
Initial number of unique patients in last relapses:  53022


In [14]:
# Optionally restrict the cohort to a single program type.
# `filter_select_only_one_program_type` is already imported in the notebook's top import cell,
# so it is not re-imported here (keeps all dependencies declared in one place).
if USE_SPECIFIC_PROGRAM_TYPE:
    print(f"Filtering last relapses for program type: {SPECIFIC_PROGRAM_TYPE}")
    # Keep only rows that the project helper selects for the requested program type,
    # using the PROGRAM_TYPE column as the discriminator.
    last_relapses_df = filter_select_only_one_program_type(
        last_relapses_df, "PROGRAM_TYPE", SPECIFIC_PROGRAM_TYPE
    )

    print(f"Number of last relapses after filtering for program type: {len(last_relapses_df)}")
    print("Number of unique patients in last relapses after filtering for program type: ", last_relapses_df['DONOR_ID'].nunique())

Filtering last relapses for program type: ProgramType.PROBATION
Number of last relapses after filtering for program type: 24303
Number of unique patients in last relapses after filtering for program type:  24303


In [15]:
# Optionally keep only donors with at least MIN_PREV_RELAPSES earlier relapses.
if SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES:
    print("Minimum number of previous relapses for a patient to be selected: ", MIN_PREV_RELAPSES)
    has_enough_history = last_relapses_df["NUM_PREV_RELAPSES"].ge(MIN_PREV_RELAPSES)
    last_relapses_df = last_relapses_df.loc[has_enough_history]
    # Sanity check: the smallest remaining value should be >= the threshold.
    smallest_remaining = last_relapses_df['NUM_PREV_RELAPSES'].min()
    print("Minimum number of previous relapses after filtering: ", smallest_remaining)

Minimum number of previous relapses for a patient to be selected:  1
Minimum number of previous relapses after filtering:  1


In [16]:
# Keep rows whose TIME_SINCE_LAST_POSITIVE is at most 90; rows with a missing value
# compare as False and are therefore dropped as well.
# NOTE(review): 90 is a hard-coded threshold — consider promoting it to a config constant.
filter_mask = last_relapses_df['TIME_SINCE_LAST_POSITIVE'].le(90)
last_relapses_df = last_relapses_df.loc[filter_mask]
remaining_events = len(last_relapses_df)
remaining_donors = last_relapses_df['DONOR_ID'].nunique()
print(f"Number of last relapses after filtering by TIME_SINCE_LAST_POSITIVE <= 90: {remaining_events}")
print("Number of unique patients in last relapses after filtering: ", remaining_donors)

Number of last relapses after filtering by TIME_SINCE_LAST_POSITIVE <= 90: 7342
Number of unique patients in last relapses after filtering:  7342


In [17]:
# Restrict the drug-test table to donors still in the cohort, then compute, per donor,
# the largest TIME_UNTIL_NEXT_TEST value observed.
donor_ids = last_relapses_df['DONOR_ID'].unique()

in_cohort = all_drug_tests_df['DONOR_ID'].isin(donor_ids)
all_drug_tests_df = all_drug_tests_df[in_cohort]

max_gap_per_donor = all_drug_tests_df.groupby('DONOR_ID').agg(
    MAX_GAP_TIME=('TIME_UNTIL_NEXT_TEST', "max")
)
gap_time_df = max_gap_per_donor.reset_index()
gap_time_df.head()

Unnamed: 0,DONOR_ID,MAX_GAP_TIME
0,10731,107.0
1,10801,188.0
2,10904,36.0
3,10994,625.0
4,11472,41.0


In [18]:
# Donors whose largest gap is strictly below 180 (missing gaps compare as False and are excluded).
# NOTE(review): 180 is a hard-coded threshold — consider promoting it to a config constant.
gap_is_acceptable = gap_time_df['MAX_GAP_TIME'].lt(180)
valid_donors_id_regarding_gap = gap_time_df.loc[gap_is_acceptable, 'DONOR_ID'].unique()

In [19]:
# Keep only last relapses of donors that passed the MAX_GAP_TIME < 180 check.
# The mask is built on the DONOR_ID column only: calling `.isin` on the whole frame would
# test every cell of every column against the donor ids and is not a valid row mask.
filter_mask = last_relapses_df['DONOR_ID'].isin(valid_donors_id_regarding_gap)
last_relapses_df = last_relapses_df[filter_mask]
print(f"Number of last relapses after filtering by MAX_GAP_TIME < 180: {len(last_relapses_df)}")
print("Number of unique patients in last relapses after filtering: ", last_relapses_df['DONOR_ID'].nunique())

Number of last relapses after filtering by MAX_GAP_TIME < 180: 5452
Number of unique patients in last relapses after filtering:  5452


In [20]:
# Impute a missing TIME_SINCE_LAST_NEGATIVE with PARTICIPATION_DAYS + 90, then add its natural log.
# `assign` returns a new frame instead of writing columns into the filtered slice produced by
# the previous cells, which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 90-day offset is hard-coded; confirm its rationale and consider a config constant.
last_relapses_df = last_relapses_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda df: df['TIME_SINCE_LAST_NEGATIVE'].fillna(df['PARTICIPATION_DAYS'] + 90),
    # Evaluated after the imputation above, so the log uses the filled values.
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda df: np.log(df['TIME_SINCE_LAST_NEGATIVE']),
)

In [21]:
if SELECT_ONLY_ONE_EVENT_PER_PATIENT:
    # Keep a single row per donor: the one with the earliest RELAPSE_START.
    # This avoids several events of the same subject ending up in the training data.
    #
    # `drop_duplicates(keep='first')` keeps the whole first row. `GroupBy.first()` would instead
    # take the first *non-null* value of each column independently, which can combine values
    # coming from different events of the same donor.
    print("Selecting only the first event for each patient")
    last_relapses_df = (
        last_relapses_df
        .sort_values(['DONOR_ID', 'RELAPSE_START'])
        .drop_duplicates(subset='DONOR_ID', keep='first')
        .reset_index(drop=True)  # same fresh RangeIndex the groupby-based version produced
    )
    print(f"Number of last relapses after selecting just first relapse: {len(last_relapses_df)}")
    print("Number of unique patients in last relapses after selecting just first relapse: ", last_relapses_df['DONOR_ID'].nunique())

Selecting only the first event for each patient
Number of last relapses after selecting just first relapse: 5452
Number of unique patients in last relapses after selecting just first relapse:  5452


In [22]:
# Optionally pass every row through the project helper `set_observation_window`
# with the configured window length and the RELAPSE_EVENT column name, then report the result.
if SET_OBSERVATION_WINDOW_SURVIVAL:
    print("Setting observation window for survival analysis")
    print("Observation window (days): ", OBSERVATION_WINDOW)
    window_args = (OBSERVATION_WINDOW, 'RELAPSE_EVENT')
    last_relapses_df = last_relapses_df.apply(set_observation_window, axis=1, args=window_args)

    event_durations = last_relapses_df['EVENT_DURATION']
    print("Number of last relapses after setting observation window: ", len(last_relapses_df))
    print("Number of unique patients in last relapses after setting observation window: ", last_relapses_df['DONOR_ID'].nunique())
    print("Maximum and minimum event duration after setting observation window: ", event_durations.max(), event_durations.min())
    print("Unique values of RELAPSE_EVENT after setting observation window: ", last_relapses_df['RELAPSE_EVENT'].unique())

Setting observation window for survival analysis
Observation window (days):  120
Number of last relapses after setting observation window:  5452
Number of unique patients in last relapses after setting observation window:  5452
Maximum and minimum event duration after setting observation window:  120.0 1.0
Unique values of RELAPSE_EVENT after setting observation window:  [1 0]


In [23]:
# Summary statistics of EVENT_DURATION for the filtered last relapses.
last_relapses_df['EVENT_DURATION'].describe()

count    5452.000000
mean       48.069516
std        42.018059
min         1.000000
25%        12.000000
50%        33.000000
75%        82.000000
max       120.000000
Name: EVENT_DURATION, dtype: float64

In [24]:
# Inspect column dtypes and non-null counts of the last-relapse table before it is exported.
last_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5452 entries, 0 to 5451
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           5452 non-null   int64  
 1   COLLECTION_ID                      5452 non-null   int64  
 2   RELAPSE_START                      5452 non-null   object 
 3   RELAPSE_END                        5452 non-null   object 
 4   EVENT_DURATION                     5452 non-null   float64
 5   RELAPSE_EVENT                      5452 non-null   int64  
 6   NUM_TESTS_PERIOD                   5452 non-null   int64  
 7   NUM_PREV_RELAPSES                  5452 non-null   int64  
 8   RELAPSE_DURATION_CATEGORY          5452 non-null   object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  5452 non-null   int64  
 10  RELAPSE_30_DAYS                    5452 non-null   int64  
 11  PREV_POSITIVE_DRUGS                5452 non-null   objec

In [25]:
# Write the filtered last relapses to disk without the index column.
print("Exporting last relapses to CSV file: ", outpath_filename_last_relapses)
last_relapses_df.to_csv(path_or_buf=outpath_filename_last_relapses, index=False)

Exporting last relapses to CSV file:  /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/last_relapses.csv


### Filter Historical Relapses

In [26]:
# Keep only historical relapses of donors that remain in the last-relapse cohort.
donor_ids = last_relapses_df['DONOR_ID'].unique()
in_cohort = historical_relapses_df['DONOR_ID'].isin(donor_ids)
historical_relapses_df = historical_relapses_df.loc[in_cohort]

n_events = len(historical_relapses_df)
n_donors = historical_relapses_df['DONOR_ID'].nunique()
print(f"Number of historical relapses after filtering by DONOR_ID: {n_events}")
print("Number of unique patients in historical relapses after filtering: ", n_donors)

Number of historical relapses after filtering by DONOR_ID: 14856
Number of unique patients in historical relapses after filtering:  5452


In [27]:
# Impute a missing TIME_SINCE_LAST_NEGATIVE with PARTICIPATION_DAYS + 90, then add its natural log.
# `historical_relapses_df` is a boolean-filtered slice at this point; building a new frame with
# `assign` (instead of setting columns on the slice) avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 90-day offset is hard-coded; confirm its rationale and consider a config constant.
historical_relapses_df = historical_relapses_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda df: df['TIME_SINCE_LAST_NEGATIVE'].fillna(df['PARTICIPATION_DAYS'] + 90),
    # Evaluated after the imputation above, so the log uses the filled values.
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda df: np.log(df['TIME_SINCE_LAST_NEGATIVE']),
)

In [28]:
# Inspect column dtypes and non-null counts of the filtered historical relapses before export.
historical_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14856 entries, 25 to 57571
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           14856 non-null  int64  
 1   COLLECTION_ID                      14856 non-null  int64  
 2   RELAPSE_START                      14856 non-null  object 
 3   RELAPSE_END                        14856 non-null  object 
 4   EVENT_DURATION                     14856 non-null  float64
 5   RELAPSE_EVENT                      14856 non-null  int64  
 6   NUM_TESTS_PERIOD                   14856 non-null  int64  
 7   NUM_PREV_RELAPSES                  14856 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          14856 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  14856 non-null  int64  
 10  RELAPSE_30_DAYS                    14856 non-null  int64  
 11  PREV_POSITIVE_DRUGS                14856 non-null  object 

In [29]:
# Write the filtered historical relapses to disk without the index column.
historical_relapses_df.to_csv(path_or_buf=outpath_filename_historical_relapses, index=False)

### Filter All Relapses

In [30]:
# Keep only relapses (of any kind) belonging to donors that remain in the last-relapse cohort.
donor_ids = last_relapses_df['DONOR_ID'].unique()
in_cohort = all_relapses_df['DONOR_ID'].isin(donor_ids)
all_relapses_df = all_relapses_df.loc[in_cohort]

n_events = len(all_relapses_df)
n_donors = all_relapses_df['DONOR_ID'].nunique()
print(f"Number of all relapses after filtering by DONOR_ID: {n_events}")
print("Number of unique patients in all relapses after filtering: ", n_donors)

Number of all relapses after filtering by DONOR_ID: 22867
Number of unique patients in all relapses after filtering:  5452


In [31]:
# Impute a missing TIME_SINCE_LAST_NEGATIVE with PARTICIPATION_DAYS + 90, then add its natural log.
# `all_relapses_df` is a boolean-filtered slice at this point; building a new frame with
# `assign` (instead of setting columns on the slice) avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 90-day offset is hard-coded; confirm its rationale and consider a config constant.
all_relapses_df = all_relapses_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda df: df['TIME_SINCE_LAST_NEGATIVE'].fillna(df['PARTICIPATION_DAYS'] + 90),
    # Evaluated after the imputation above, so the log uses the filled values.
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda df: np.log(df['TIME_SINCE_LAST_NEGATIVE']),
)

In [32]:
# Inspect column dtypes and non-null counts of the filtered all-relapses table.
all_relapses_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22867 entries, 40 to 118358
Data columns (total 37 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   DONOR_ID                           22867 non-null  int64  
 1   COLLECTION_ID                      22867 non-null  int64  
 2   RELAPSE_START                      22867 non-null  object 
 3   RELAPSE_END                        22867 non-null  object 
 4   EVENT_DURATION                     22867 non-null  float64
 5   RELAPSE_EVENT                      22867 non-null  int64  
 6   NUM_TESTS_PERIOD                   22867 non-null  int64  
 7   NUM_PREV_RELAPSES                  22867 non-null  int64  
 8   RELAPSE_DURATION_CATEGORY          22867 non-null  object 
 9   RELAPSE_DURATION_CATEGORY_ENCODED  22867 non-null  int64  
 10  RELAPSE_30_DAYS                    22867 non-null  int64  
 11  PREV_POSITIVE_DRUGS                22867 non-null  object

In [33]:
# Optionally pass every row of the all-relapses table through `set_observation_window`,
# using the same window length and event column as for the last relapses.
if SET_OBSERVATION_WINDOW_SURVIVAL:
    window_args = (OBSERVATION_WINDOW, 'RELAPSE_EVENT')
    all_relapses_df = all_relapses_df.apply(set_observation_window, axis=1, args=window_args)

In [34]:
# Report the EVENT_DURATION range and the distinct RELAPSE_EVENT values in the all-relapses table.
event_durations = all_relapses_df['EVENT_DURATION']
event_flags = all_relapses_df['RELAPSE_EVENT'].unique()
print("Maximum and minimum event duration after setting observation window: ", event_durations.max(), event_durations.min())
print("Unique values of RELAPSE_EVENT after setting observation window: ", event_flags)

Maximum and minimum event duration after setting observation window:  120.0 1.0
Unique values of RELAPSE_EVENT after setting observation window:  [1 0]


In [35]:
# Write the filtered all-relapses table to disk without the index column.
# `DataFrame.to_csv` returns None when a path is given, so its result is deliberately NOT
# assigned back to `all_relapses_df`; the frame stays usable if later cells need it.
print("Exporting all relapses to CSV file: ", output_path_filename_all_relapses)
all_relapses_df.to_csv(output_path_filename_all_relapses, index=False)

Exporting last relapses to CSV file:  /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/all_relapses.csv


### Filter Historical Drug Tests

In [36]:
# Keep only historical drug tests of donors that remain in the last-relapse cohort.
donor_ids = last_relapses_df['DONOR_ID'].unique()
in_cohort = historical_drug_tests_df['DONOR_ID'].isin(donor_ids)
historical_drug_tests_df = historical_drug_tests_df.loc[in_cohort]

n_tests = len(historical_drug_tests_df)
n_donors = historical_drug_tests_df['DONOR_ID'].nunique()
print(f"Number of historical drug tests after filtering by DONOR_ID: {n_tests}")
print("Number of unique patients in historical drug tests after filtering: ", n_donors)

Number of historical drug tests after filtering by DONOR_ID: 129278
Number of unique patients in historical drug tests after filtering:  5452


In [37]:
# Impute a missing TIME_SINCE_LAST_NEGATIVE with PARTICIPATION_DAYS + 90, then add its natural log.
# `historical_drug_tests_df` is a boolean-filtered slice at this point; building a new frame with
# `assign` (instead of setting columns on the slice) avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 90-day offset is hard-coded; confirm its rationale and consider a config constant.
historical_drug_tests_df = historical_drug_tests_df.assign(
    TIME_SINCE_LAST_NEGATIVE=lambda df: df['TIME_SINCE_LAST_NEGATIVE'].fillna(df['PARTICIPATION_DAYS'] + 90),
    # Evaluated after the imputation above, so the log uses the filled values.
    LOG_TIME_SINCE_LAST_NEGATIVE=lambda df: np.log(df['TIME_SINCE_LAST_NEGATIVE']),
)

In [38]:
# Impute a missing TIME_SINCE_LAST_POSITIVE with PARTICIPATION_DAYS + 30, then add its natural log.
# A new frame is built with `assign` rather than setting columns on the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
# NOTE(review): the 30-day offset differs from the 90 days used for TIME_SINCE_LAST_NEGATIVE
# and is hard-coded; confirm this is intentional and consider a config constant.
historical_drug_tests_df = historical_drug_tests_df.assign(
    TIME_SINCE_LAST_POSITIVE=lambda df: df['TIME_SINCE_LAST_POSITIVE'].fillna(df['PARTICIPATION_DAYS'] + 30),
    # Evaluated after the imputation above, so the log uses the filled values.
    LOG_TIME_SINCE_LAST_POSITIVE=lambda df: np.log(df['TIME_SINCE_LAST_POSITIVE']),
)

In [39]:
# Write the filtered historical drug tests to disk without the index column.
# `DataFrame.to_csv` returns None when a path is given, so its result is deliberately NOT
# assigned back to `historical_drug_tests_df` (that would replace the frame with None).
historical_drug_tests_df.to_csv(output_path_filename_historical_drug_tests, index=False)

### Metadata

In [36]:
# Persist the switches that shaped this dataset next to the exported CSV files.
# NOTE(review): the two *_HOSP keys carry the relapse-based settings; the names look inherited
# from another dataset's notebook — confirm what downstream readers expect before renaming.
# NOTE(review): the program-type switch and the hard-coded 90/180-day thresholds used in this
# notebook are not recorded here.
output_file_controls = os.path.join(output_path, "control_variables.yaml")

control_variables = dict(
    SELECT_ONLY_ONE_EVENT_PER_PATIENT=SELECT_ONLY_ONE_EVENT_PER_PATIENT,
    SET_OBSERVATION_WINDOW_SURVIVAL=SET_OBSERVATION_WINDOW_SURVIVAL,
    OBSERVATION_WINDOW=OBSERVATION_WINDOW,
    SELECT_PATIENTS_WITH_MORE_PREV_HOSP=SELECT_PATIENTS_WITH_MORE_PREV_RELAPSES,
    MIN_PREV_HOSP=MIN_PREV_RELAPSES,
)

with open(output_file_controls, 'w') as controls_file:
    yaml.dump(control_variables, controls_file)

print(f"Control variables exported to: {output_file_controls}")

Control variables exported to: /workspaces/master-thesis-recurrent-health-events-prediction/data/avh-data-preprocessed/mutiple_relapses_patients_probation_120_days/control_variables.yaml
