In [1]:
import pandas as pd
import numpy as np
import os 
from rich.progress import track


In [2]:
# Directory that receives the raw PhysioNet Challenge 2012 downloads.
file_path = "./files/"
# exist_ok=True replaces the separate existence check and tolerates re-runs.
os.makedirs(file_path, exist_ok=True)

file_list = ["set-a.tar.gz","set-b.tar.gz","set-c.tar.gz","Outcomes-a.txt","Outcomes-b.txt","Outcomes-c.txt"]
for file in file_list:
    target = os.path.join(file_path, file)
    # Anything already present is treated as downloaded (and extracted) by an earlier run.
    if os.path.exists(target):
        continue
    command = "wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/%s -nd -P %s"%(file,file_path)
    status = os.system(command)
    print(command)
    # Do not claim success when wget produced nothing: fail here instead of
    # much later with a confusing missing-file error.
    if not os.path.exists(target):
        raise RuntimeError("Download of %s failed (wget exit status %s)" % (file, status))
    if status != 0:
        print("[!] wget exited with status %s for %s; the file may be incomplete" % (status, file))
    print("%s is downloaded!"%(file))
    if file.endswith(".tar.gz"):
        # Pass the joined path so the command does not contain a doubled slash.
        tar_status = os.system("tar -zxvf %s -C %s"%(target,file_path))
        if tar_status != 0:
            raise RuntimeError("Extraction of %s failed (tar exit status %s)" % (file, tar_status))
        print("%s is unzipped!"%(file))


wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/set-a.tar.gz -nd -P ./files/
set-a.tar.gz is downloaded!
set-a.tar.gz is unzipped!
wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/set-b.tar.gz -nd -P ./files/
set-b.tar.gz is downloaded!
set-b.tar.gz is unzipped!
wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/set-c.tar.gz -nd -P ./files/
set-c.tar.gz is downloaded!
set-c.tar.gz is unzipped!
wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/Outcomes-a.txt -nd -P ./files/
Outcomes-a.txt is downloaded!
wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/Outcomes-b.txt -nd -P ./files/
Outcomes-b.txt is downloaded!
wget -r -N -c -np https://physionet.org/files/challenge-2012/1.0.0/Outcomes-c.txt -nd -P ./files/
Outcomes-c.txt is downloaded!


In [3]:
# Static descriptors: three numeric values followed by two categorical ones.
demo_num_features = "Age Height Weight".split()
# Gender (0: female, or 1: male)
# ICUType (1: Coronary Care Unit, 2: Cardiac Surgery Recovery Unit, 3: Medical ICU, or 4: Surgical ICU)
demo_cat_features = "Gender ICUType".split()
demographic_features = [*demo_num_features, *demo_cat_features]

# Time-stamped measurements; the position of a name in `time_features`
# is the column index used for that variable.
time_num_features = (
    "ALP ALT AST Albumin BUN Bilirubin Cholesterol Creatinine "
    "DiasABP FiO2 GCS Glucose HCO3 HCT HR K Lactate MAP "
    "Mg NIDiasABP NIMAP NISysABP Na PaCO2 PaO2 "
    "Platelets RespRate SaO2 SysABP Temp TroponinI TroponinT "
    "Urine WBC Weight pH"
).split()
time_cat_features = "MechVent".split()
time_features = [*time_num_features, *time_cat_features]

feature_counts = (len(demographic_features), len(time_features))
print('demographic_features: %d  time_features:%d ' % feature_counts)

demographic_features: 5  time_features:37 


In [4]:
# combine labels
label_file = ['Outcomes-a.txt','Outcomes-b.txt','Outcomes-c.txt']
# Read every outcome file first and concatenate once: growing a frame with
# pd.concat inside a loop copies all previous rows on each iteration, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
label = pd.concat([pd.read_csv(file_path+name) for name in label_file])
label = label.set_index('RecordID')
# Later cells call label.loc[record_id] and expect exactly one row per record.
assert label.index.is_unique, "duplicate RecordID found in outcome files"
name_labels = list(label.columns)
print(name_labels)
label

['SAPS-I', 'SOFA', 'Length_of_stay', 'Survival', 'In-hospital_death']


Unnamed: 0_level_0,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
RecordID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
132539,6,1,5,-1,0
132540,16,8,8,-1,0
132541,21,11,19,-1,0
132543,7,1,9,575,0
132545,17,2,4,918,0
...,...,...,...,...,...
163029,18,8,17,-1,0
163033,9,1,9,-1,0
163034,13,10,8,-1,0
163035,20,9,71,96,0


In [5]:
# combine samples
# Directory that holds all per-record files side by side.
data_path = "./data/"
if not os.path.exists(data_path):
    os.mkdir(data_path)

# Flatten the three extracted sub-dataset folders into data_path.
for sub_dataset_file in ('./set-a','./set-b','./set-c'):
    move_command = "mv %s/%s/* %s/"%(file_path,sub_dataset_file,data_path)
    os.system(move_command)

# Keep only the entries whose name ends in "txt", in sorted order.
files_list = os.listdir(data_path)
files = sorted(entry for entry in files_list if entry[-3:]=="txt")
assert len(files)==12000,"[x] DownloadError: Please delete files and download again!"

len(files),files[:5]

(12000, ['132539.txt', '132540.txt', '132541.txt', '132543.txt', '132545.txt'])

In [7]:
# generate irregular dataset

def process_sample(file):
    """Convert one raw record file into an ``.npz`` archive in ``data_path``.

    The first line of the file is skipped; every following line is parsed as
    ``HH:MM,<parameter>,<value>``. The archive is named after the integer
    prefix of ``file`` (the name minus its last four characters) and holds:

    * ``info``: 9 entries initialised to -1. Slots 0-2 are the numeric
      demographics, slots 3-4 a one-hot Gender and slots 5-8 a one-hot
      ICUType (assuming the 0/1 and 1-4 codings noted next to the feature
      lists).
    * ``regular_data``: 48 hourly bins (``stime`` in seconds) with ``tdata``
      values and a ``tmask`` marking which cells were observed. A later value
      in the same bin overwrites an earlier one.
    * ``irregular_data``: one row per distinct time stamp in file order, with
      the same ``tdata``/``tmask`` layout.
    * ``labels``: the row of the global ``label`` frame for this record.

    Parameters
    ----------
    file : str
        File name relative to ``data_path``, e.g. ``"132539.txt"``.

    Notes
    -----
    The two dicts are stored as pickled object arrays, so reading them back
    requires ``np.load(..., allow_pickle=True)``.
    """
    admission_id = int(file[:-4])
    # The context manager closes the handle; the original leaked one per record.
    with open(data_path+file) as f:
        lines = f.readlines()
    sample_info = np.full(fill_value=-1,shape=len(demo_num_features)+2+4).tolist()

    sample_re_stime = [3600*i for i in range(48)]
    sample_re_tdata = np.full((48,len(time_features)),0.0)
    sample_re_tmask = np.full((48,len(time_features)),0.0)

    sample_ir_stime = []
    sample_ir_tdata = []
    sample_ir_tmask = []
    # time stamp (seconds) -> row in the irregular arrays; replaces the linear
    # `in` / `.index` scans of sample_ir_stime done for every record.
    row_of_time = {}

    # Column positions of the variables that get an extra plausibility check.
    ph_index = time_features.index('pH')
    weight_index = time_features.index('Weight')
    temp_index = time_features.index('Temp')

    for line in lines[1:]:
        # Strip only line terminators: `line[:-1]` would cut a data character
        # off a final line that has no trailing newline.
        line = line.rstrip("\r\n")
        if not line:
            continue
        record = line.split(",")
        stime = record[0].split(":")
        seconds = (int(stime[0])*60 + int(stime[1]))*60

        # A row is created for every new time stamp, even if the record is
        # later rejected or turns out to be a demographic entry.
        t_index = row_of_time.get(seconds)
        if t_index is None:
            t_index = len(sample_ir_stime)
            row_of_time[seconds] = t_index
            sample_ir_stime.append(seconds)
            sample_ir_tdata.append(np.full(len(time_features),0.0))
            sample_ir_tmask.append(np.full(len(time_features),0.0))

        if record[1] in demographic_features and seconds==0:
            value = float(record[2])
            f_index = demographic_features.index(record[1])
            if f_index < len(demo_num_features):
                # Height below 100 and Weight below 30 are treated as missing
                # (this also drops the -1 placeholder).
                if f_index == 1 and value < 100:
                    continue
                elif f_index == 2 and value < 30:
                    continue
                sample_info[f_index] = value
            elif record[1] == demo_cat_features[0] and value!=-1:
                sample_info[len(demo_num_features)+int(record[2])] = 1
            elif record[1] == demo_cat_features[1] and value!=-1:
                sample_info[len(demo_num_features)+int(record[2])+1] = 1
            else:
                # Categorical descriptor recorded as -1 (unknown): report it.
                print(record)

        elif record[1] in time_features and record[2]!='':
            f_index = time_features.index(record[1])
            value = float(record[2])

            # outcome-related descriptors are non-negative (>= 0)
            if value<0:
                continue
            # pH must lie in [5, 8]
            if f_index==ph_index and (value<5 or value>8):
                continue
            # Weight must be at least 10
            if f_index==weight_index and value<10:
                continue
            # Temp must be at least 10
            if f_index==temp_index and value<10:
                continue

            sample_ir_tdata[t_index][f_index]= value
            sample_ir_tmask[t_index][f_index]= 1

            # Bin k covers (k*3600, (k+1)*3600] seconds; int() truncates toward
            # zero, so time 0 also lands in bin 0.
            regular_time_index = int((seconds-1)/3600)
            sample_re_tdata[regular_time_index][f_index]= value
            sample_re_tmask[regular_time_index][f_index]= 1

    admission_label = list(label.loc[admission_id])

    np.savez(data_path+str(admission_id),
             info = sample_info,
             regular_data = {
                 "stime": sample_re_stime,
                 "tdata": sample_re_tdata,
                 "tmask": sample_re_tmask,
             },
             irregular_data = {
                 "stime": sample_ir_stime,
                 "tdata": sample_ir_tdata,
                 "tmask": sample_ir_tmask,
             },

             labels = admission_label,
            )
    
# Convert every raw record into its .npz archive, showing a progress bar.
for sample_file in track(files):
    process_sample(sample_file)
# To debug a single record: process_sample("137392.txt")

Output()

['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']
['00:00', 'Gender', '-1']


In [6]:
# Preview the record ids and the in-hospital mortality targets as arrays.
tuple(np.array(column) for column in (label.index, label['In-hospital_death']))

(array([132539, 132540, 132541, ..., 163034, 163035, 163037]),
 array([0, 0, 0, ..., 0, 0, 0]))

In [7]:
from sklearn.model_selection import train_test_split, KFold
def get_folds(indices, array_y, num=5):
    """Create ``num`` stratified train/valid/test splits of ``indices``.

    Each split holds out 20% of the data (stratified on ``array_y``) and then
    halves that hold-out into validation and test parts. Split ``i`` uses
    ``random_state=2012+i`` for both steps. The class balance of each part is
    printed.

    Returns a list with one ``[index_train, index_valid, index_test]`` entry
    per split.
    """
    folds = []
    for offset in range(num):
        seed = 2012+offset
        index_train, index_holdout, Y_train, Y_holdout = train_test_split(
            indices, array_y, test_size=0.2, random_state=seed,
            shuffle=True, stratify=array_y)
        index_valid, index_test, Y_valid, Y_test = train_test_split(
            index_holdout, Y_holdout, test_size=0.5, random_state=seed,
            shuffle=True, stratify=Y_holdout)

        # Report positives / negatives / total for every part of the split.
        for template, targets in (("[train] 1:%d 0:%d all:%d", Y_train),
                                  ("[valid] 1:%d 0:%d all:%d", Y_valid),
                                  ("[test]  1:%d 0:%d all:%d", Y_test)):
            positives = sum(targets)
            total = len(targets)
            print(template %(positives,total-positives,total))
        folds.append([index_train,index_valid,index_test])
    return folds


In [8]:
# Record ids and their in-hospital mortality targets, as plain arrays.
indices, array_y = (np.array(column) for column in (label.index, label['In-hospital_death']))
folds = get_folds(indices, array_y)
folds

[train] 1:1366 0:8234 all:9600
[valid] 1:171 0:1029 all:1200
[test]  1:170 0:1030 all:1200
[train] 1:1366 0:8234 all:9600
[valid] 1:171 0:1029 all:1200
[test]  1:170 0:1030 all:1200
[train] 1:1366 0:8234 all:9600
[valid] 1:171 0:1029 all:1200
[test]  1:170 0:1030 all:1200
[train] 1:1366 0:8234 all:9600
[valid] 1:171 0:1029 all:1200
[test]  1:170 0:1030 all:1200
[train] 1:1366 0:8234 all:9600
[valid] 1:170 0:1030 all:1200
[test]  1:171 0:1029 all:1200


[[array([150491, 143937, 148710, ..., 150063, 135326, 140879]),
  array([134391, 148241, 145189, ..., 141786, 156673, 149901]),
  array([134205, 151033, 147324, ..., 136306, 158770, 132767])],
 [array([143784, 134576, 142356, ..., 135077, 146068, 150891]),
  array([134063, 152733, 154494, ..., 147317, 156459, 142035]),
  array([158157, 138649, 143646, ..., 162090, 154211, 138961])],
 [array([142332, 162316, 149033, ..., 152087, 154143, 146790]),
  array([157594, 147753, 136638, ..., 140088, 144938, 140016]),
  array([147204, 161066, 153946, ..., 151962, 141244, 146571])],
 [array([141183, 158862, 134157, ..., 150889, 153850, 146588]),
  array([133023, 154329, 138378, ..., 133274, 145690, 135859]),
  array([145172, 151441, 153697, ..., 145900, 161062, 138848])],
 [array([149092, 157974, 154688, ..., 149277, 146466, 142378]),
  array([148798, 158168, 143387, ..., 162154, 149290, 158739]),
  array([157204, 147736, 154271, ..., 146018, 140341, 155691])]]

In [9]:
# Demographic vector of every record, in the same order as `files`.
info_list = []
for file in track(files):
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it as soon as the 'info' array has been read.
    with np.load(data_path+"%s.npz"%(file[:-4])) as data:
        info_list.append(data['info'])
len(info_list)

Output()

12000

In [10]:
# Collect, per slot of the info vector, every value that is not the -1 placeholder.
info_dict = {slot: [] for slot in range(9)}
for sub_info in info_list:
    for slot in range(9):
        entry = sub_info[slot]
        if entry != -1:
            info_dict[slot].append(entry)

# Slots belonging to each descriptor: one slot per numeric value,
# two for Gender and four for ICUType.
slot_groups = ([0], [1], [2], [3, 4], [5, 6, 7, 8])
for descriptor, slots in zip(demographic_features, slot_groups):
    print(descriptor, sum(len(info_dict[slot]) for slot in slots))

Age 12000
Height 6257
Weight 10997
Gender 11988
ICUType 12000


In [11]:
# missing gender
# Both gender slots (3 and 4) still hold the -1 placeholder when they sum to -2.
gender_slots = (3, 4)
for position, demo_vector in enumerate(info_list):
    if sum(demo_vector[slot] for slot in gender_slots) == -2:
        print(files[position], demo_vector)

135757.txt [ 43.  180.3 123.   -1.   -1.   -1.   -1.    1.   -1. ]
137392.txt [43. -1. -1. -1. -1. -1. -1.  1. -1.]
141486.txt [85.  -1.  56.7 -1.  -1.  -1.  -1.  -1.   1. ]
143896.txt [47.  -1.  49.9 -1.  -1.  -1.  -1.   1.  -1. ]
144078.txt [86.  -1.  59.8 -1.  -1.  -1.  -1.   1.  -1. ]
146615.txt [34.  -1.  73.8 -1.  -1.  -1.  -1.   1.  -1. ]
147570.txt [ 38.   -1.  133.6  -1.   -1.   -1.   -1.    1.   -1. ]
148436.txt [ 78.  185.4  83.7  -1.   -1.   -1.    1.   -1.   -1. ]
156222.txt [ 26.  188.   97.7  -1.   -1.   -1.    1.   -1.   -1. ]
156244.txt [59. -1. -1. -1. -1. -1.  1. -1. -1.]
156818.txt [79. -1. -1. -1. -1. -1. -1.  1. -1.]
160470.txt [78. -1. -1. -1. -1. -1. -1.  1. -1.]


In [12]:
# demographic information norm
def get_norm_demo(info_list,folds):
    """Compute per-fold normalisation statistics of the numeric demographics.

    Only the training part (``sub_fold[0]``) of every fold is used, and
    entries equal to the -1 placeholder are ignored.

    Parameters
    ----------
    info_list : sequence
        Demographic vectors aligned with the global ``files`` list.
    folds : sequence
        Splits whose first element holds the training record ids.

    Returns
    -------
    dict
        Keys ``"max"``, ``"min"``, ``"avg"`` and ``"std"``; each value is a
        list with one entry per fold holding the statistic of the first three
        info slots.
    """
    print("============== norm of demographic features =================")
    # Look positions up in a dict instead of calling files.index() for every
    # training record (that made each fold quadratic in the number of files).
    # setdefault keeps the first position, matching list.index().
    position_of = {}
    for position, name in enumerate(files):
        position_of.setdefault(name, position)

    numeric_slots = range(3)

    demo_mean = []
    demo_std = []
    demo_min = []
    demo_max = []
    for fold_id, sub_fold in enumerate(folds):
        print("fold%2d"%fold_id)

        # Observed (non-placeholder) values per numeric slot.
        observed = [[] for _ in numeric_slots]
        for index in sub_fold[0]:
            sub_info = info_list[position_of["%s.txt"%index]]
            for slot in numeric_slots:
                if sub_info[slot] != -1:
                    observed[slot].append(sub_info[slot])

        demo_mean.append([np.average(values) for values in observed])
        demo_std.append([np.std(values) for values in observed])
        demo_min.append([np.min(values) for values in observed])
        demo_max.append([np.max(values) for values in observed])
    return {
        "max": demo_max,
        "min": demo_min,
        "avg": demo_mean,
        "std": demo_std,
    }

In [13]:
def get_norm_result(samples_data,samples_mask,folds):
    """Compute per-fold normalisation statistics of the time-series features.

    Only the training part (``sub_fold[0]``) of every fold is used, and only
    cells whose mask equals 1 contribute. Numeric features get their observed
    mean/std/min/max; the remaining (categorical) features get the fixed
    values mean 0, std 1, min 0, max 1.

    Parameters
    ----------
    samples_data, samples_mask : sequence
        One ``(time steps, len(time_features))`` table per record, aligned
        with the global ``files`` list. Records may have different numbers
        of time steps.
    folds : sequence
        Splits whose first element holds the training record ids.

    Returns
    -------
    dict
        Keys ``"max"``, ``"min"``, ``"avg"`` and ``"std"``; each value is a
        list with one entry per fold holding one number per time feature.
    """
    print("============== norm of time-series features =================")
    n_features = len(time_features)
    n_numeric = len(time_num_features)

    # The inputs are deliberately NOT wrapped in np.array(): irregular samples
    # have different lengths and NumPy >= 1.24 refuses to build such ragged
    # arrays implicitly.

    # Dict lookup instead of files.index() per training record; setdefault
    # keeps the first position, matching list.index().
    position_of = {}
    for position, name in enumerate(files):
        position_of.setdefault(name, position)

    mean_mortality = []
    std_mortality = []
    min_mortality = []
    max_mortality = []

    for fold_id, sub_fold in enumerate(folds):
        print("fold%2d"%fold_id)

        # Per numeric feature: one array of observed values per record, kept
        # in (record, time) order so the result matches a cell-by-cell scan.
        observed = [[] for _ in range(n_numeric)]
        for index in sub_fold[0]:
            file_index = position_of["%s.txt"%index]
            # reshape keeps records without any time step two-dimensional.
            values = np.asarray(samples_data[file_index], dtype=float).reshape(-1, n_features)
            is_observed = np.asarray(samples_mask[file_index]).reshape(-1, n_features) == 1
            for key in range(n_numeric):
                observed[key].append(values[is_observed[:, key], key])

        mor_mean = []
        mor_std = []
        mor_min = []
        mor_max = []
        for key in range(n_features):
            if key < n_numeric:
                chunks = observed[key]
                recorded = np.concatenate(chunks) if chunks else np.array([])
                mor_mean.append(np.mean(recorded))
                mor_std.append(np.std(recorded))
                mor_min.append(np.min(recorded))
                mor_max.append(np.max(recorded))
            else:
                # Categorical features are left unscaled.
                mor_mean.append(0)
                mor_std.append(1)
                mor_min.append(0)
                mor_max.append(1)

        mean_mortality.append(mor_mean)
        std_mortality.append(mor_std)
        min_mortality.append(mor_min)
        max_mortality.append(mor_max)
    return {
        "max": max_mortality,
        "min": min_mortality,
        "avg": mean_mortality,
        "std": std_mortality,
    }

In [37]:
# Hourly-binned ("regular") and per-time-stamp ("irregular") tables of every
# record, in the same order as `files`.
re_samples_data = []
re_samples_mask = []

ir_samples_data = []
ir_samples_mask = []

for file in files:
    # The two dicts were saved as pickled 0-d object arrays, which np.load
    # refuses to read unless allow_pickle=True (the default is False since
    # NumPy 1.16.3).
    # NOTE: unpickling executes arbitrary code; this is acceptable only
    # because these archives were written by this notebook itself.
    with np.load(data_path+"%s.npz"%(file[:-4]), allow_pickle=True) as data:
        # [()] unwraps the dict from its 0-d array; each dict is read once.
        regular = data['regular_data'][()]
        irregular = data['irregular_data'][()]
    re_samples_data.append(regular['tdata'])
    re_samples_mask.append(regular['tmask'])
    ir_samples_data.append(irregular['tdata'])
    ir_samples_mask.append(irregular['tmask'])

In [38]:
# Per-fold statistics: demographics first, then the regular and the irregular
# time-series representations, in that order.
demo_norm = get_norm_demo(info_list, folds)
regular_norm, irregular_norm = (
    get_norm_result(data_tables, mask_tables, folds)
    for data_tables, mask_tables in (
        (re_samples_data, re_samples_mask),
        (ir_samples_data, ir_samples_mask),
    )
)

fold 0
fold 1
fold 2
fold 3
fold 4
fold 0
fold 1
fold 2
fold 3
fold 4
fold 0
fold 1
fold 2
fold 3
fold 4


In [41]:
# The train/valid/test id arrays of a fold have different lengths, so
# np.array(folds) is ragged and raises on NumPy >= 1.24. Fill an object array
# of shape (number of folds, 3) explicitly instead; every cell holds one id array.
fold_tvt = np.empty((len(folds), 3), dtype=object)
for fold_id, sub_fold in enumerate(folds):
    for part_id, part_indices in enumerate(sub_fold):
        fold_tvt[fold_id, part_id] = part_indices

# The object arrays and dicts below are pickled: read this archive back with
# np.load(..., allow_pickle=True).
np.savez('inhos_mortality_folds',
         fold_tvt = fold_tvt,
         # Derived from the feature lists rather than hard-coded (37 and 9).
         input_dim = len(time_features),
         # Numeric demographics + 2 gender slots + 4 ICU-type slots.
         info_dim = len(demo_num_features)+2+4,
         demo_norm = demo_norm,
         regular_norm = regular_norm,
         irregular_norm = irregular_norm,
         time_features = {
             "num_features": time_num_features,
             "cat_features": time_cat_features, 
         },
         demo_features = {
             "num_features": demo_num_features,
             "cat_features": demo_cat_features, 
         },
         labels = name_labels,
        )

In [108]:
# Re-open the archive written above and list the names of the stored arrays.
temp = np.load("inhos_mortality_folds.npz")
list(temp.files)

['fold_tvt',
 'input_dim',
 'info_dim',
 'demo_norm',
 'regular_norm',
 'irregular_norm',
 'time_features',
 'demo_features']

In [53]:
# os.system("rm %s/*.txt"%(data_path))

0