In [1]:
import pickle as pkl
from gensim.models import Word2Vec
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import os.path
import pandas as pd
from collections import Counter
import bz2


DATA_DIR = '../data/'
random.seed(3778)

In [2]:
# load data from saved files - preprocessed data for notes and diagnosis codes

df_diag_diabetes_hadm_ids = pkl.load(open(f'{DATA_DIR}diag_diabetes_hadm_ids.p','rb'))
print('df_diag_diabetes_hadm_ids.shape:   ', df_diag_diabetes_hadm_ids.shape)

df_diag_icd9 = pkl.load(open(f'{DATA_DIR}diag_icd9.p','rb'))
print('df_diag_icd9.shape:                ', df_diag_icd9.shape)

df_diag_icd9_rolled = pkl.load(open(f'{DATA_DIR}diag_icd9_rolled.p','rb'))
print('df_diag_icd9_rolled.shape:         ', df_diag_icd9_rolled.shape)

df_notes = pkl.load(open(f'{DATA_DIR}notes_final.p','rb'))
print('df_notes.shape:                    ', df_notes.shape)

notes_tokens_list = pkl.load(open(f'{DATA_DIR}notes_tokens_list.p','rb'))
print('notes_tokens_list.length:          ', len(notes_tokens_list))

icd9_unique_list = pkl.load(open(f'{DATA_DIR}diag_icd9_unique_list.p','rb'))
print('icd9_unique_list.len:              ', len(icd9_unique_list))

icd9_rolled_unique_list = pkl.load(open(f'{DATA_DIR}diag_icd9_rolled_unique_list.p','rb'))
print('icd9_rolled_unique_list.len:       ', len(icd9_rolled_unique_list))

df_diag_diabetes_hadm_ids.shape:    (14222,)
df_diag_icd9.shape:                 (14222, 3)
df_diag_icd9_rolled.shape:          (14222, 3)
df_notes.shape:                     (399631, 8)
notes_tokens_list.length:           53229
icd9_unique_list.len:               4103
icd9_rolled_unique_list.len:        781


In [9]:
# Statistics for comparison with original paper statistics after preprocessing

print('Num. of used records             ', df_notes.shape[0])
print('Num. of regular labels           ',len(icd9_unique_list))
print('Num. of rolled up labels         ',len(icd9_rolled_unique_list))
print('Num. of unique tokens            ',len(notes_tokens_list))
print('Avg. num. of tokens per report   ',df_notes['NTOKENS_LEN'].sum() / len(df_notes['NTOKENS_LEN']))

Num. of used records              399631
Num. of regular labels            4103
Num. of rolled up labels          781
Num. of unique tokens             53229
Avg. num. of tokens per report    309.09424193818796


In [21]:
# list of all notes
text_list = df_notes['NTOKENS'].to_list()
print('text_list len: ', len(text_list))

text_list len:  399631


In [46]:
df_notes_icd9 = pd.merge(df_notes, df_diag_icd9, on=['HADM_ID'], how='inner').drop(columns = ['TEXT', 'TOKENS', 'SUBJECT_ID', 'NTOKENS_LEN'])
df_notes_icd9_rolled = pd.merge(df_notes, df_diag_icd9_rolled, on=['HADM_ID'], how='inner').drop(columns = ['TEXT', 'TOKENS', 'SUBJECT_ID', 'NTOKENS_LEN'])
print('df_notes_icd9.shape        : ', df_notes_icd9.shape)
print('df_notes_icd9_rolled.shape : ', df_notes_icd9_rolled.shape)

# print(df_notes_icd9.head(2))
# print(df_notes_icd9_rolled.head(2))

df_notes_icd9.shape        :  (399631, 6)
df_notes_icd9_rolled.shape :  (399631, 6)


In [12]:
w2v_model_file = f'{DATA_DIR}word2vec_model.model'

if os.path.exists(w2v_model_file):
  # read from saved file
  model_w2v = Word2Vec.load(w2v_model_file)
  print('read from saved model file: ', model_w2v)
else:
  # initialize Word2Vec
  model_w2v = Word2Vec(min_count=1, vector_size=300, workers=4, sg=1, seed=3778)
  print('model initialized: ',model_w2v)

  model_w2v.build_vocab(text_list)
  print('model vacab created: ',model_w2v)

  # list of words not in word2vec vocab
  dict1 = model_w2v.wv.index_to_key
  dict2 = notes_tokens_list.keys()
  print('tokens not in w2v vocab: ', dict2 - dict1 )

  # Word2Vec model training - trained model saved
  model_w2v.train(text_list, 
                  total_examples=model_w2v.corpus_count, 
                  epochs=model_w2v.epochs)
  # Write to file 
  model_w2v.save(w2v_model_file)


read from saved model file:  Word2Vec(vocab=53203, vector_size=300, alpha=0.025)


In [13]:
# Create word embedding matrix
embedding_matrix = model_w2v.wv[model_w2v.wv.index_to_key]
print('embedding_matrix.shape: ', embedding_matrix.shape)

embedding_matrix.shape:  (53203, 300)


In [14]:
# Create dict for embedding matrix (word <-> row)
row_dict=dict({word:idx for idx,word in enumerate(model_w2v.wv.index_to_key)})
print('row_dict.length: ', len(row_dict))
# Create and map unknown and padding tokens to null
embedding_matrix = np.concatenate((embedding_matrix, np.zeros((2,300))), axis=0)
row_dict['_unknown_'] = len(model_w2v.wv.index_to_key)
row_dict['_padding_'] = len(model_w2v.wv.index_to_key) + 1
print('row_dict.length after padding: ', len(row_dict))

row_dict.length:  53203
row_dict.length after padding:  53205


In [15]:
# save embedded matrix and row_dict to file
# pkl.dump(row_dict, open(f'{DATA_DIR}row_index_dictionary.p', 'wb'))
# pkl.dump(embedding_matrix, open(f'{DATA_DIR}embedded_matrix.p', 'wb'))

# read from saved file
row_dict = pkl.load(open(f'{DATA_DIR}row_index_dictionary.p', 'rb'))
embedding_matrix = pkl.load(open(f'{DATA_DIR}embedded_matrix.p', 'rb'))
print('row_dict.length after padding: ', len(row_dict))
print('embedding_matrix.shape: ', embedding_matrix.shape)

row_dict.length after padding:  53205
embedding_matrix.shape:  (53203, 300)


In [16]:
def convert_token_to_index(tokens, row_dict):
    return [row_dict.get(token, row_dict['_unknown_']) for token in tokens]

In [48]:
MAX_LENGTH = 2200

indexed_notes = (df_notes['NTOKENS']
      .apply(convert_token_to_index, row_dict=row_dict)
      .apply(lambda x: np.squeeze(pad_sequences([x], padding = 'post', truncating = 'post', 
      maxlen = MAX_LENGTH, value = row_dict['_padding_']))))
      
print(type(indexed_notes))

<class 'pandas.core.series.Series'>


In [52]:
X = np.vstack(indexed_notes.to_list())
print('X.type: ',type(X))
print('X.shape: ', X.shape)

X.type:  <class 'numpy.ndarray'>
X.shape:  (399631, 2200)


In [58]:
df_notes_icd9['INDEXED_TOKENS'] = [x for x in X]
print('df_notes_icd9.shape: ', df_notes_icd9.shape)
# print(df_notes_icd9.head(2))

df_notes_icd9.shape:  (399631, 7)


In [59]:
df_notes_icd9_rolled['INDEXED_TOKENS'] = [x for x in X]
print('df_notes_icd9_rolled.shape: ', df_notes_icd9_rolled.shape)
# print(df_notes_icd9_rolled.head(2))

df_notes_icd9_rolled.shape:  (399631, 7)


In [71]:
# validation
t1 = dict(Counter(df_notes_icd9['INDEXED_TOKENS'][0]))
t2 = dict(Counter(df_notes_icd9['INDEXED_TOKENS'][1]))
print('2200 - 428 = ', t1[53204],',', '2200 - 306 = ', t2[53204])

t1 = dict(Counter(df_notes_icd9_rolled['INDEXED_TOKENS'][0]))
t2 = dict(Counter(df_notes_icd9_rolled['INDEXED_TOKENS'][1]))
print('2200 - 428 = ', t1[53204],',', '2200 - 306 = ', t2[53204])

2200 - 428 =  1772 , 2200 - 306 =  1894
2200 - 428 =  1772 , 2200 - 306 =  1894


In [89]:
Y_ICD9 = np.vstack(df_notes_icd9['ICD9_CODE_MLB'].to_numpy())
print('Y_ICD9.type: ',type(Y_ICD9))
print('Y_ICD9.shape: ', Y_ICD9.shape)

Y_ICD9.type:  <class 'numpy.ndarray'>
Y_ICD9.shape:  (399631, 4103)


In [90]:
Y_ICD9_ROLLED = np.vstack(df_notes_icd9_rolled['ICD9_CODE_ROLLED_MLB'].to_numpy())
print('Y_ICD9_ROLLED.type: ',type(Y_ICD9_ROLLED))
print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)

Y_ICD9_ROLLED.type:  <class 'numpy.ndarray'>
Y_ICD9_ROLLED.shape:  (399631, 781)


In [None]:
# save data and label to file
pkl.dump(X, open(f'{DATA_DIR}X_NOTES_INDEXED.p', 'wb'))
pkl.dump(Y_ICD9, open(f'{DATA_DIR}Y_ICD9.p', 'wb'))
pkl.dump(Y_ICD9_ROLLED, open(f'{DATA_DIR}Y_ICD9_ROLLED.p', 'wb'))

In [2]:
# read from saved file

X = pkl.load(open(f'{DATA_DIR}X_NOTES_INDEXED.p', 'rb'))
print('X.type: ',type(X))
print('X.shape: ', X.shape)

Y_ICD9 = pkl.load(open(f'{DATA_DIR}Y_ICD9.p', 'rb'))
print('Y_ICD9.type: ',type(Y_ICD9))
print('Y_ICD9.shape: ', Y_ICD9.shape)

Y_ICD9_ROLLED = pkl.load(open(f'{DATA_DIR}Y_ICD9_ROLLED.p', 'rb'))
print('Y_ICD9_ROLLED.type: ',type(Y_ICD9_ROLLED))
print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)

X.type:  <class 'numpy.ndarray'>
X.shape:  (399631, 2200)
Y_ICD9.type:  <class 'numpy.ndarray'>
Y_ICD9.shape:  (399631, 4103)
Y_ICD9_ROLLED.type:  <class 'numpy.ndarray'>
Y_ICD9_ROLLED.shape:  (399631, 781)


In [5]:
# save data and label to file
pkl.dump(X, bz2.BZ2File(f'{DATA_DIR}X_NOTES_INDEXED.bz2', 'wb'))
print('X saved')
# pkl.dump(Y_ICD9, bz2.BZ2File(f'{DATA_DIR}Y_ICD9.bz2', 'wb'))
# print('Y_ICD9 saved')
# pkl.dump(Y_ICD9_ROLLED, bz2.BZ2File(f'{DATA_DIR}Y_ICD9_ROLLED.bz2', 'wb'))
# print('Y_ICD9 saved')


In [4]:
Y_ICD9_ROLLED = pkl.load(bz2.BZ2File(f'{DATA_DIR}Y_ICD9_ROLLED.bz2', 'rb'))
print('Y_ICD9_ROLLED.type: ',type(Y_ICD9_ROLLED))
print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)

Y_ICD9_ROLLED.type:  <class 'numpy.ndarray'>
Y_ICD9_ROLLED.shape:  (399631, 781)
