In [1]:
import yaml
import math
import random
import sparse
import h5py
import pandas as pd
import numpy as np

# Get the CCS Code of each ICD9_CODE in the DIAGNOSES_ICD table.

In [2]:
ccs_dir = "../../ccs/hcup_ccs_2015_definitions.yaml"

### ccs_data is loaded as a dict.

In [3]:
# Parse the CCS definitions file at ccs_dir into ccs_data.
# NOTE(review): yaml.FullLoader can construct more than plain data types; if the
# file only holds mappings, lists and scalars, yaml.safe_load would be the safer
# choice — confirm before switching.
with open(ccs_dir, "r") as f:
    ccs_data = yaml.load(f, Loader=yaml.FullLoader)

### Expand the "use_in_benchmark" and "id" to the same length as codes.

In [4]:
# Repeat each category's scalar "use_in_benchmark" flag and "id" once per code,
# so the three fields can later be flattened into parallel lists.
for entry in ccs_data.values():
    # Already expanded (e.g. the cell was run twice): leave the entry alone.
    # Without this guard a re-run would call int() on a list and raise TypeError.
    if isinstance(entry["id"], list):
        continue
    n_codes = len(entry["codes"])
    entry["use_in_benchmark"] = [int(entry["use_in_benchmark"])] * n_codes
    entry["id"] = [entry["id"]] * n_codes

In [6]:
len(ccs_data)

283

In [7]:
# NOTE(review): absolute, machine-specific path, unlike the relative paths used
# for the other tables in this notebook — consider making it configurable.
d_diag = pd.read_csv("/home/data/datasets/mimic-III/tables/D_ICD_DIAGNOSES.csv")

In [9]:
d_diag.shape[0]

14567

### Merge the data of each key.

In [5]:
# Flatten the per-category lists into three parallel lists with one entry per code.
ids, codes, use_in_benchmarks = [], [], []

for entry in ccs_data.values():
    ids.extend(entry["id"])
    codes.extend(entry["codes"])
    use_in_benchmarks.extend(entry["use_in_benchmark"])

### Build the DataFrame with the data.

In [6]:
ccs_df = pd.DataFrame({"ICD9_CODE": codes, "CCS": ids, "USED": use_in_benchmarks})

### Merge ccs_df and DIAGNOSES_ICD on ICD9_CODE. 

In [7]:
diag_dir = "../../tables/DIAGNOSES_ICD.csv"

In [8]:
diag = pd.read_csv(diag_dir)

In [9]:
diag = diag[diag["ICD9_CODE"].notna()]

In [10]:
diag = diag.merge(ccs_df, how="left", on="ICD9_CODE")

### Drop the CCS Codes which are not used.

In [11]:
diag = diag[diag["USED"] == 1]

### Map the CCS Codes to index.

In [12]:
# Sorted list of the distinct, non-null CCS values that remain in diag.
unique_ccs_codes = sorted(diag["CCS"].dropna().unique())

In [13]:
# Position of each CCS value inside the sorted unique_ccs_codes list.
# A dict lookup replaces the per-row list.index() scan, and .assign() returns a
# new frame instead of item-setting on the boolean-filtered diag, which would
# trigger pandas' SettingWithCopyWarning.
ccs_to_index = {code: position for position, code in enumerate(unique_ccs_codes)}
diag = diag.assign(INDEX=diag["CCS"].map(ccs_to_index))

# Get the diagnoses multihot label and merge to mortality label.

### Remove every HADM_ID that maps to multiple ICUSTAY_IDs.

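Steps: read the ICU stays table; use `drop_duplicates(keep=False)` to remove every HADM_ID that has more than one ICU stay; merge the mortality label table with the ICU table; drop rows with a null HADM_ID; merge the diagnoses table; build the multi-hot labels.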

### Drop duplicate HADM_ID.

In [14]:
icu_dir = "../../tables/ICUSTAYS.csv"

In [15]:
icu = pd.read_csv(icu_dir)

In [16]:
# keep=False removes every row whose HADM_ID occurs more than once (no
# representative is kept), so only HADM_IDs with a single row remain.
icu = icu.drop_duplicates(subset=["HADM_ID"], keep=False)

### Merge Mortality label and icu.

In [17]:
mort_label_dir = "../../processed/population/mortality_48.0h.csv"

In [18]:
mort_label = pd.read_csv(mort_label_dir)

In [19]:
mort_label.rename(columns={"ID": "ICUSTAY_ID"}, inplace=True)

In [20]:
mort_label = pd.merge(mort_label[["ICUSTAY_ID", "mortality_LABEL"]], icu[["ICUSTAY_ID", "HADM_ID"]], how="left", on="ICUSTAY_ID")

### Drop the HADM_ID with multiple ICUSTAY_ID.

In [21]:
mort_label = mort_label[mort_label["HADM_ID"].notna()]

In [22]:
# Cast HADM_ID to int now that the null rows are gone. .assign() is used instead
# of item-setting because mort_label comes from a boolean filter, and setting a
# column on it in place triggers pandas' SettingWithCopyWarning.
mort_label = mort_label.assign(HADM_ID=mort_label["HADM_ID"].astype(int))

### Get the CCS index of each ICUSTAY_ID/HADM_ID.

In [23]:
mort_label = pd.merge(mort_label, diag[["HADM_ID", "INDEX"]], on="HADM_ID", how="left")

### Drop the ICUSTAY_IDs without an INDEX. These either have no ICD9_CODE or only ICD9_CODEs whose CCS category is not used.

In [24]:
mort_label = mort_label[mort_label["INDEX"].notna()]

In [25]:
# Cast INDEX to int now that the null rows are gone. .assign() is used instead
# of item-setting because mort_label comes from a boolean filter, and setting a
# column on it in place triggers pandas' SettingWithCopyWarning.
mort_label = mort_label.assign(INDEX=mort_label["INDEX"].astype(int))

### Build the multi-hot label for Diagnoses.

In [26]:
diag_label = mort_label.groupby("ICUSTAY_ID")["INDEX"].apply(lambda x: x.to_numpy()).reset_index()

In [27]:
def multihot(x, num_classes=25):
    """Encode a collection of class indices as a multi-hot vector.

    Parameters
    ----------
    x : array-like of int
        Indices to switch on; repeated indices are harmless.
    num_classes : int, optional
        Length of the returned vector. Defaults to 25, the length that was
        previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_classes`` holding 1 at every index in
        ``x`` and 0 elsewhere.
    """
    temp = np.zeros(num_classes)
    temp[x] = 1
    return temp

In [28]:
diag_label["diagnoses_LABEL"] = diag_label["INDEX"].map(multihot)

### Merge the diagnoses label and mortality label.

In [29]:
diag_mort_label = pd.merge(diag_label[["ICUSTAY_ID", "diagnoses_LABEL"]], mort_label[["ICUSTAY_ID", "mortality_LABEL"]], on="ICUSTAY_ID", how="left").drop_duplicates(subset="ICUSTAY_ID")

# Get the LOS label and merge to diag_mort_label.

In [30]:
los = pd.merge(diag_mort_label["ICUSTAY_ID"], icu[["ICUSTAY_ID", "LOS"]], on="ICUSTAY_ID", how="left")

### Classify the LOS into 9 categories (in days, counted after subtracting the first 2 days): 1, 2, 3, 4, 5, 6, 7, 8-14, >14.

In [31]:
def classify(x):
    """Map a length of stay to a bucket index.

    The value is shifted down by 2 and floored. Results up to 6 are returned
    unchanged (so inputs below 2 produce negative numbers), results from 7
    through 13 collapse to bucket 7, and anything larger goes to bucket 8.
    """
    shifted = math.floor(x - 2)
    if shifted > 13:
        return 8
    if shifted > 6:
        return 7
    return shifted

In [32]:
los["INDEX"] = los["LOS"].map(classify)

### Build the one-hot vector

In [33]:
def onehot(x, num_classes=9):
    """Encode a single class index as a one-hot vector.

    Parameters
    ----------
    x : int
        Index of the class to switch on; must satisfy ``0 <= x < num_classes``.
    num_classes : int, optional
        Length of the returned vector. Defaults to 9, the length that was
        previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``num_classes`` with a single 1 at position ``x``.

    Raises
    ------
    IndexError
        If ``x`` lies outside ``[0, num_classes)``.
    """
    # NumPy would accept a negative index and wrap it around to the end of the
    # vector, silently producing a wrong label, so reject it explicitly.
    if not 0 <= x < num_classes:
        raise IndexError(f"class index {x} is out of range for {num_classes} classes")
    temp = np.zeros(num_classes)
    temp[x] = 1
    return temp

In [34]:
los["los_LABEL"] = los["INDEX"].map(onehot)

### Merge los to diag_mort_label.

In [35]:
los_diag_mort_label = pd.merge(los[["ICUSTAY_ID", "los_LABEL"]], diag_mort_label, on="ICUSTAY_ID", how="left")

# Split the data into train, val and test.

### Get the ids

In [36]:
ids = set(los_diag_mort_label["ICUSTAY_ID"].to_numpy())

In [37]:
print(len(ids))

9734


In [37]:
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# from 3.11), so sample from a sorted list. A dedicated seeded generator makes
# the 70% training split reproducible across kernel restarts.
train_rng = random.Random(0)
train_ids = set(train_rng.sample(sorted(ids), int(len(ids) * 0.7)))

In [38]:
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# from 3.11), so sample from a sorted list of the ids not already in train_ids.
# A dedicated seeded generator makes the 15% validation split reproducible.
val_rng = random.Random(1)
val_ids = set(val_rng.sample(sorted(ids - train_ids), int(len(ids) * 0.15)))

In [39]:
test_ids = ids - train_ids - val_ids

### Get the partition of each id.

In [40]:
def split(x):
    """Return the partition name for an id.

    Membership in the module-level ``train_ids`` set is checked first, then
    ``val_ids``; any id found in neither is labelled "test".
    """
    if x in train_ids:
        return "train"
    return "val" if x in val_ids else "test"

In [41]:
los_diag_mort_label["partition"] = los_diag_mort_label["ICUSTAY_ID"].map(split)

## Build the features.

In [42]:
time_series_dir = "../../processed/features/outcome=Mortality,T=48.0,dt=1.0/X.npz"
time_invariant_dir = "../../processed/features/outcome=Mortality,T=48.0,dt=1.0/s.npz"

In [43]:
time_series = sparse.load_npz(time_series_dir).todense()
time_invariant = sparse.load_npz(time_invariant_dir).todense()

### Assume the rows of time_series and time_invariant are in the same order as the rows of the mortality label file.

In [44]:
id_feats = pd.read_csv(mort_label_dir)

In [45]:
id_feats.rename(columns={"ID": "ICUSTAY_ID"}, inplace=True)

### Get the features of each ID.

In [46]:
# Attach one feature array per row: iterating each dense array along its first
# axis yields the per-row slices, in order.
id_feats["time_series"] = list(time_series)
id_feats["time_invariant"] = list(time_invariant)

### Merge to los_diag_mort_label

In [47]:
los_diag_mort_data = pd.merge(los_diag_mort_label, id_feats[["ICUSTAY_ID", "time_series", "time_invariant"]], on="ICUSTAY_ID", how="left")

# Save the data to hdf5.

In [48]:
def save(df, partition, hdf):
    """Write every column of ``df`` into a new group of ``hdf``.

    A group named ``partition`` is created on ``hdf``; for each column, the
    per-row values are stacked into a single array with ``np.stack`` and
    stored as a dataset named after the column.
    """
    target = hdf.create_group(partition)
    for column_name in df.columns:
        stacked = np.stack(df[column_name].to_numpy())
        target.create_dataset(column_name, data=stacked)

In [49]:
partitions = ["train", "val", "test"]

In [50]:
# Mode "a" opens data.hdf5 read/write and creates it if missing; existing
# contents are kept rather than truncated.
# NOTE(review): the handle is only closed in a later cell, so it stays open if a
# cell in between raises — a `with h5py.File(...)` block would avoid that.
hdf = h5py.File("data.hdf5", "a")

In [51]:
for partition in partitions:
    df = los_diag_mort_data[los_diag_mort_data["partition"] == partition]
    save(df.drop(columns="partition"), partition, hdf)

In [52]:
hdf.close()
