In [1]:
import numpy as np
import pandas as pd
import pickle
import warnings

# Ignore all warnings: `Warning` is the base class of every warning category,
# so this filter also hides deprecation and runtime warnings from pandas/numpy
# for the rest of the session.
warnings.filterwarnings("ignore", category=Warning)

# 1. Create model dataset from MediSyn data

In [2]:
# Load the MediSyn diagnosis table. Besides SUBJECT_ID, HADM_ID, SEQ_NUM and
# ICD9_CODE, the file carries an extra "Unnamed: 0" column (presumably an index
# that was written out when the CSV was saved).
diagnose = pd.read_csv("DIAGNOSES_ICD.csv")

In [3]:
diagnose.dtypes

Unnamed: 0     int64
SUBJECT_ID     int64
HADM_ID        int64
SEQ_NUM        int64
ICD9_CODE     object
dtype: object

In [4]:
diagnose.head(5)

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,0,1,100000,0,51881
1,1,1,100000,1,5990
2,2,1,100000,2,5849
3,3,1,100000,3,2765
4,4,1,100000,4,2449


In [None]:
diagnose.tail(5)

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
558399,558399,50000,154293,12,44020
558400,558400,50000,154293,13,7230
558401,558401,50000,154293,14,99641
558402,558402,50000,154293,15,1552
558403,558403,50000,154293,16,34460


## Create labels.pkl
Identify the patients who have at least one diagnosis with the heart failure ICD-9 code (`4280`).

In [5]:
# One 0/1 label per patient, ordered by SUBJECT_ID: 1 when any of the patient's
# diagnosis rows carries the heart-failure code '4280', 0 otherwise.
labels = (
    diagnose['ICD9_CODE'].eq('4280')
    .groupby(diagnose['SUBJECT_ID'])
    .any()
    .sort_index()
    .astype(int)
    .tolist()
)

In [6]:
len(labels) # total patient

49984

In [76]:
sum(labels) # heart failure cases

10807

In [77]:
# Replace every ICD-9 string by an integer index; factorize() returns
# (codes, uniques) and only the integer codes are kept.
diagnose['ICD9_FAC'] = diagnose['ICD9_CODE'].factorize()[0]

In [78]:
diagnose.loc[diagnose.ICD9_CODE == '4280',]

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD9_FAC
49,49,3,100004,3,4280,44
93,93,7,100008,1,4280,44
123,123,9,100011,0,4280,44
326,326,25,100029,2,4280,44
394,394,33,100037,1,4280,44
...,...,...,...,...,...,...
558243,558243,49991,154282,3,4280,44
558256,558256,49991,154283,1,4280,44
558281,558281,49991,154284,2,4280,44
558310,558310,49993,154286,2,4280,44


## Create sequences.pkl

In [79]:
def filter_icd_values(group, hf_code=44):
    """Return one patient's factorized ICD-9 codes recorded before heart failure.

    Parameters
    ----------
    group : pandas.DataFrame
        Diagnosis rows of a single patient; must contain an ``ICD9_FAC`` column.
    hf_code : int, optional
        Value of ``ICD9_FAC`` that marks the heart-failure diagnosis. The
        default, 44, is the index that factorizing ``ICD9_CODE`` assigned to
        '4280' in this notebook's data; pass another value if the codes are
        re-factorized on different data.

    Returns
    -------
    list
        The ``ICD9_FAC`` values, in row order, up to but excluding the first
        occurrence of ``hf_code``. The full list is returned when ``hf_code``
        never occurs, and an empty list when it is the first entry.
    """
    icd_values = group['ICD9_FAC'].tolist()
    try:
        # Position of the first heart-failure code; later occurrences are irrelevant.
        first_hf = icd_values.index(hf_code)
    except ValueError:
        return icd_values  # heart-failure code not found in the list
    return icd_values[:first_hf]

In [80]:
# One truncated code list per patient, in ascending SUBJECT_ID order.
seq = [
    filter_icd_values(patient_rows)
    for _subject_id, patient_rows in diagnose.groupby('SUBJECT_ID')
]

In [81]:
# Highest factorized ICD-9 index that appears in the sequences. This is an index,
# not a count: codes are numbered from 0, so a downstream vocabulary must hold
# at least this value + 1 entries. A generator avoids the quadratic list
# concatenation of sum(seq, []).
max(code for codes in seq for code in codes)

5887

## Create time.pkl

As this dataset was generated by a GAN model, the admission dates are not continuous for a given patient. We therefore generated the timestamps and the durations between visits from the admission ID (HADM_ID), under the assumption that two consecutive visits of the same patient are about three months (90 days) apart.

In [7]:
# Smallest HADM_ID of each patient, broadcast back onto every diagnosis row
min_HADM_ID = diagnose.groupby('SUBJECT_ID')['HADM_ID'].transform('min')

# Offset of each admission from the patient's first one, converted to days
# (90 per admission step) and shifted so the first admission gets timestamp 1
admission_offset = diagnose['HADM_ID'].sub(min_HADM_ID)
diagnose['TIME_STAMP'] = admission_offset.mul(90).add(1)

In [9]:
diagnose['TIME_STAMP'].max()

1171

In [83]:
def _times_before_hf(group, hf_code='4280'):
    """Return one patient's timestamps for the diagnoses preceding heart failure.

    The list is cut at the first row whose ICD9_CODE equals ``hf_code``, which
    is the same cut-off used when building ``seq``, so both lists stay aligned
    entry by entry.
    """
    codes = group['ICD9_CODE'].tolist()
    stamps = group['TIME_STAMP'].tolist()
    cutoff = codes.index(hf_code) if hf_code in codes else len(codes)
    return stamps[:cutoff]


# A flat per-row list would have one entry per diagnosis row instead of one
# list per patient; build a list of lists with the same shape as `seq`.
times = diagnose.groupby('SUBJECT_ID').apply(_times_before_hf).tolist()

## Save pkl data for both Python 2 and Python 3

In [84]:
import os
import pickle

# Objects to export and their destinations. Protocol 2 is used because it is
# the newest pickle protocol that Python 2 can still read.
py2_outputs = [
    ("MediSyn_pkl/MS_labels.pkl", labels),
    ("MediSyn_pkl/MS_sequences.pkl", seq),
    ("MediSyn_pkl/MS_times.pkl", times),
]

for path, obj in py2_outputs:
    # Create the output folder on first use so a fresh checkout does not fail
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Open the file for writing in binary mode and serialize the list into it
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=2)

In [85]:
import pickle

# Same three lists as the Python 2 export, written with the default pickle
# protocol of the running interpreter.
py3_outputs = (
    ("MediSyn_pkl/MS_labels_py3.pkl", labels),
    ("MediSyn_pkl/MS_sequences_py3.pkl", seq),
    ("MediSyn_pkl/MS_times_py3.pkl", times),
)

for target_path, payload in py3_outputs:
    # Binary write mode is required by pickle
    with open(target_path, "wb") as out_file:
        pickle.dump(payload, out_file)

# 2. Synthetic Data Glance

## Labels

In [2]:
import pickle

# Deserialize the label list (only unpickle files that come from a trusted source)
with open('labels.pkl', 'rb') as fh:
    labels = pickle.load(fh)

In [49]:
labels[13]

1

In [3]:
import collections

# Tally how many times each label value occurs
labels_dict = collections.Counter(labels)

# Report one "value: count" line per distinct label
for value in labels_dict:
    print(f"{value}: {labels_dict[value]}")

0: 92
1: 8


## Sequences

In [39]:
# Deserialize the per-patient code sequences
with open('sequences.pkl', 'rb') as fh:
    sequences = pickle.load(fh)

In [23]:
len(sequences)

100

## Time duration

In [6]:
# Deserialize the time information; encoding='latin1' is the setting the pickle
# docs recommend for reading pickles produced by Python 2 (presumably the case here)
with open('times.pkl', 'rb') as fh:
    times = pickle.load(fh, encoding='latin1')

In [31]:
len(times)

100

## Embedding

In [27]:
# Deserialize the embedding matrix; encoding='latin1' is the setting the pickle
# docs recommend for Python 2 pickles that contain NumPy arrays
with open('emb.pkl', 'rb') as fh:
    emb = pickle.load(fh, encoding='latin1')

In [28]:
emb

array([[-0.00104703, -0.00018714,  0.00812972, ..., -0.003401  ,
         0.00230845,  0.00485769],
       [-0.00110725,  0.00222372, -0.00311453, ...,  0.00434983,
         0.00357147,  0.00088711],
       [ 0.00985814,  0.00543434,  0.00257608, ...,  0.00998995,
        -0.00293335, -0.00990275],
       ...,
       [-0.00400008,  0.00765756, -0.00077224, ..., -0.00690673,
        -0.00307302, -0.00459354],
       [ 0.00655403,  0.00189304,  0.0076644 , ...,  0.0084704 ,
        -0.00972553,  0.00065817],
       [ 0.00982774, -0.00962515,  0.00661188, ...,  0.00898594,
        -0.00266903,  0.00202336]])

# Model Running Log

>python gru_onehot.py sequences.pkl labels.pkl output1 && \

>Loading data ...  done!!<br />
Building the model ...  done!!<br />
Constructing the optimizer ...  done!!<br />
Optimization start !!<br />
epoch:0, valid_auc:0.222222<br />
Currenlty the best test_auc:0.777778<br />
epoch:1, valid_auc:0.222222<br />
epoch:2, valid_auc:0.222222<br />
epoch:3, valid_auc:0.222222<br />
epoch:4, valid_auc:0.222222<br />
...

>python gru_onehot_time.py sequences.pkl times.pkl labels.pkl output2 && \

>Loading data ...  done!!<br />
Building the model ...  done!!<br />
Constructing the optimizer ...  done!!<br />
Optimization start !!<br />
epoch:0, valid_auc:0.625000<br />
Currenlty the best test_auc:0.444444<br />
epoch:1, valid_auc:0.625000<br />
epoch:2, valid_auc:0.625000<br />
epoch:3, valid_auc:0.625000<br />
epoch:4, valid_auc:0.625000<br />
...

>python gru_emb.py sequences.pkl labels.pkl emb.pkl output3

>Building the model ...  done!!<br />
Constructing the optimizer ...  done!!<br />
Optimization start !!<br />
epoch:0, valid_auc:0.777778<br />
Currenlty the best test_auc:0.333333<br />
epoch:1, valid_auc:0.777778<br />
epoch:2, valid_auc:0.444444<br />
epoch:3, valid_auc:0.444444<br />
epoch:4, valid_auc:0.444444<br />
...

>python gru_emb_time.py sequences.pkl times.pkl labels.pkl emb.pkl output4

>Loading data ...  done!!<br />
Building the model ...  done!!<br />
Constructing the optimizer ...  done!!<br />
Optimization start !!<br />
epoch:0, valid_auc:0.444444<br />
Currenlty the best test_auc:0.250000<br />
epoch:1, valid_auc:0.444444<br />
epoch:2, valid_auc:0.444444<br />
epoch:3, valid_auc:0.444444<br />
epoch:4, valid_auc:0.444444<br />
...

# Runtime Documentation

| Model Name | Runtime |
|---|---|
| GRU_onehot | 2 hrs 53 min |
| GRU_onehot_time | 2 hrs 38 min |
| GRU_emb | 2 hrs 39 min |
| GRU_emb_time | 2 hrs 56 min |



# Output Data Glance

In [7]:
# Open the parameter archive written by the gru_onehot run
data = np.load('output1.npz')

# Print the name of every array stored in the archive, one per line
for array_name in data.files:
    print(array_name)

b_logistic
b_gru
W_gru
W_logistic
U_gru


In [8]:
# Open the parameter archive written by the gru_onehot_time run
data = np.load('output2.npz')

# Print the name of every array stored in the archive, one per line
for array_name in data.files:
    print(array_name)

b_logistic
b_gru
W_gru
W_logistic
U_gru


In [9]:
# Open the parameter archive written by the gru_emb run
data = np.load('output3.npz')

# Print the name of every array stored in the archive, one per line
for array_name in data.files:
    print(array_name)

b_logistic
b_gru
W_gru
W_logistic
U_gru


In [10]:
# Open the parameter archive written by the gru_emb_time run
data = np.load('output4.npz')

# Print the name of every array stored in the archive, one per line
for array_name in data.files:
    print(array_name)

b_logistic
b_gru
W_gru
W_logistic
U_gru


# 3. Build GRU, KNN and SVM using python 3

The model and code developed by the authors were based on Python 2, the required packages were out of date, and the running time was long on our PCs. We therefore developed our own models and code based on Python 3 with PyTorch.


The authors only provided Python 2 code for the GRU models. We enlarged our dataset and developed our own GRU, KNN and SVM models based on Python 3 and PyTorch.


The codes can be found in our repo
https://github.com/GermanGGarzon/rnn_predict_hf_Luna_German
