In [1]:
import pandas as pd

In [2]:
# Read the tuning-train, tuning-validation and full-train CSV files, in that
# order, and bind each resulting DataFrame to its notebook-level name.
train_df, validation_df, full_train_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/train_test/train_tuning.csv",
        "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/train_test/validation_tuning.csv",
        "/workspaces/msc-thesis-recurrent-health-modeling/data/mimic-iii-preprocessed/copd_hf_renal_diabetes/multiple_hosp_patients/train_test/train_full.csv",
    )
)

In [3]:
# Preview the first five rows of the tuning-train split.
train_df.head(n=5)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,HOSPITALIZATION_DAYS,NUM_COMORBIDITIES,TYPES_COMORBIDITIES,HAS_DIABETES,HAS_COPD,HAS_CONGESTIVE_HF,...,ADMISSION_TYPE_URGENT,INSURANCE_GOVERNMENT,INSURANCE_MEDICAID,INSURANCE_MEDICARE,INSURANCE_PRIVATE,ETHNICITY_BLACK,ETHNICITY_HISPANIC,ETHNICITY_WHITE,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_POST_ACUTE_CARE
0,124,172461,2160-06-24 21:25:00,2160-07-15 15:10:00,20.739583,3,"['other', 'chronic_pulmonary_disease', 'cerebr...",False,True,False,...,False,False,False,False,True,False,False,True,True,False
1,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,7.497222,3,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,True,False
2,124,134369,2165-05-21 21:02:00,2165-06-06 16:00:00,15.790278,4,"['other', 'peripheral_vascular_disease', 'chro...",False,True,False,...,False,False,False,True,False,False,False,True,False,True
3,138,108120,2131-10-31 08:00:00,2131-11-06 12:54:00,6.204167,4,"['other', 'congestive_heart_failure', 'myocard...",False,True,True,...,False,False,False,False,True,False,False,True,True,False
4,138,188284,2133-12-21 23:52:00,2134-01-11 13:00:00,20.547222,3,"['other', 'congestive_heart_failure', 'maligna...",False,False,True,...,False,False,False,False,True,False,False,True,True,False


In [4]:
# Row-count sanity check: the two tuning splits together must contain exactly
# as many rows as the full training frame.
n_tuning_rows = len(train_df) + len(validation_df)
assert n_tuning_rows == len(full_train_df), "Train and validation sets do not sum up to full training set."

In [5]:
# Distinct HADM_ID values found in each of the three frames.
train_event_ids, validation_event_ids, full_train_event_ids = (
    set(frame["HADM_ID"].tolist())
    for frame in (train_df, validation_df, full_train_df)
)

# Distinct SUBJECT_ID values found in each of the three frames.
subject_ids_in_train, subject_ids_in_validation, subject_ids_in_full_train = (
    set(frame["SUBJECT_ID"].tolist())
    for frame in (train_df, validation_df, full_train_df)
)

# Leakage checks: the tuning splits must share no SUBJECT_ID and no HADM_ID,
# and together they must reproduce exactly the IDs of the full training frame.
assert not (subject_ids_in_train & subject_ids_in_validation), "Train and validation sets have overlapping SUBJECT_IDs."
assert (subject_ids_in_train | subject_ids_in_validation) == subject_ids_in_full_train, "Union of train and validation sets does not equal full training set."
assert not (train_event_ids & validation_event_ids), "Train and validation sets have overlapping HADM_IDs."
assert (train_event_ids | validation_event_ids) == full_train_event_ids, "Union of train and validation event IDs does not equal full training event IDs."

In [6]:
# Report the row count of each frame, one line per split.
for size_message, frame in (
    ("Num of samples in train set:", train_df),
    ("Num of samples in validation set:", validation_df),
    ("Num of samples in full train set:", full_train_df),
):
    print(size_message, len(frame))

Num of samples in train set: 3888
Num of samples in validation set: 904
Num of samples in full train set: 4792


In [9]:
# Number of distinct SUBJECT_ID values in (train, validation, full train).
tuple(
    frame["SUBJECT_ID"].nunique()
    for frame in (train_df, validation_df, full_train_df)
)

(1201, 301, 1502)

In [8]:
# Keep rows where IS_LAST_EVENT equals 1 and NEXT_ADMISSION_TYPE is anything
# other than "ELECTIVE" (missing values compare as "not equal" and are kept).
# NOTE(review): presumably these are each patient's final recorded admission
# with a non-elective follow-up -- confirm against the preprocessing code.
is_last_event = full_train_df["IS_LAST_EVENT"].eq(1)
next_is_not_elective = full_train_df["NEXT_ADMISSION_TYPE"].ne("ELECTIVE")
mask = is_last_event & next_is_not_elective
last_events_df = full_train_df.loc[mask]

# Number of distinct subjects that remain after filtering.
last_events_df["SUBJECT_ID"].nunique()

1412

In [41]:
import torch

base_path = "/workspaces/msc-thesis-recurrent-health-modeling/_models/mimic/deep_learning/attention_pooling_min/multiple_hosp_patients/"


def _load_dataset(filename):
    """Load one serialized dataset object stored under ``base_path``.

    NOTE(review): ``weights_only=False`` makes ``torch.load`` unpickle
    arbitrary Python objects, so these files must come from a trusted source.
    """
    return torch.load(base_path + filename, weights_only=False)


# Load order matches the original cell.
full_train_dataset = _load_dataset("train_full_dataset.pt")
test_dataset = _load_dataset("test_dataset.pt")
train_final_dataset = _load_dataset("train_final_dataset.pt")
validation_final_dataset = _load_dataset("validation_final_dataset.pt")
last_events_dataset = _load_dataset("last_events_dataset.pt")

In [46]:
# Size of the test dataset as reported by its __len__
# (presumably one item per sample -- TODO confirm against the dataset class).
len(test_dataset)

1085

In [47]:
# Wrap the dataset's subject_ids in a Series so distinct values can be counted.
# Note: despite its name, `test_samples` holds the number of unique subject
# IDs, not the number of samples.
test_subject_ids = pd.Series(test_dataset.subject_ids)
test_samples = test_subject_ids.nunique()
print("Num of unique SUBJECT_IDs in test dataset:", test_samples)

Num of unique SUBJECT_IDs in test dataset: 371


In [48]:
import numpy as np

# Gather the "y" entry of every item in test_dataset.samples into one array.
y = np.array(list(map(lambda record: record["y"], test_dataset.samples)))

In [49]:
# Mean of the collected "y" values
# (presumably the positive-class rate if y is binary -- TODO confirm).
np.mean(y)

np.float64(0.2571428571428571)

In [40]:
# Size of the final validation dataset as reported by its __len__
# (presumably one item per sample -- TODO confirm against the dataset class).
len(validation_final_dataset)

431

In [42]:
# Size of the last-events dataset as reported by its __len__
# (presumably one item per sample -- TODO confirm against the dataset class).
len(last_events_dataset)

353