In [49]:
import pickle as pkl
from gensim.models.word2vec import Word2Vec
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
import os.path
import pandas as pd
from collections import Counter
import gzip
from sklearn.preprocessing import MultiLabelBinarizer


# Directory (relative to the notebook) holding every cached input/output
# artifact read or written below.
DATA_DIR = '../data/'

# Seed both random generators that are in scope. The dataset shuffle further
# down is drawn from NumPy's global generator, which random.seed() alone does
# not influence.
random.seed(3778)
np.random.seed(3778)

In [2]:
# load data from saved files - preprocessed data for notes and diagnosis codes
#
# NOTE(security): unpickling executes arbitrary code embedded in the file.
# Only load artifacts that were produced by this project's own preprocessing.


def _load_pickle(path, opener=open):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    ``opener`` is the callable used to open the file in binary mode; pass
    ``gzip.open`` for gzip-compressed pickles.
    """
    with opener(path, 'rb') as f:
        return pkl.load(f)


df_diag_diabetes_hadm_ids = _load_pickle(f'{DATA_DIR}diag_diabetes_hadm_ids.p')
print('df_diag_diabetes_hadm_ids.shape:   ', df_diag_diabetes_hadm_ids.shape)

# Despite the ".csv.gz" suffix these two files are read as gzip-compressed pickles.
df_diag_icd9 = _load_pickle(f'{DATA_DIR}diag_icd9.csv.gz', opener=gzip.open)
print('df_diag_icd9.shape:                ', df_diag_icd9.shape)

df_diag_icd9_rolled = _load_pickle(f'{DATA_DIR}diag_icd9_rolled.csv.gz', opener=gzip.open)
print('df_diag_icd9_rolled.shape:         ', df_diag_icd9_rolled.shape)

df_notes = _load_pickle(f'{DATA_DIR}notes_final.gz', opener=gzip.open)
print('df_notes.shape:                    ', df_notes.shape)

notes_tokens_list = _load_pickle(f'{DATA_DIR}notes_tokens_list.p')
print('notes_tokens_list.length:          ', len(notes_tokens_list))

icd9_unique_list = _load_pickle(f'{DATA_DIR}diag_icd9_unique_list.p')
print('icd9_unique_list.len:              ', len(icd9_unique_list))

icd9_rolled_unique_list = _load_pickle(f'{DATA_DIR}diag_icd9_rolled_unique_list.p')
print('icd9_rolled_unique_list.len:       ', len(icd9_rolled_unique_list))

df_diag_diabetes_hadm_ids.shape:    (14222,)
df_diag_icd9.shape:                 (14222, 3)
df_diag_icd9_rolled.shape:          (14222, 3)
df_notes.shape:                     (399623, 7)
notes_tokens_list.length:           53229
icd9_unique_list.len:               4103
icd9_rolled_unique_list.len:        781


In [3]:
# Sanity check on the loaded notes: the stored NTOKENS_LEN must equal the
# actual number of tokens in NTOKENS for every row. Checking the whole frame
# avoids relying on a hard-coded row index that is only valid for one
# particular dataset size.
_actual_token_counts = df_notes['NTOKENS'].apply(len)
_length_mismatch = _actual_token_counts != df_notes['NTOKENS_LEN']
assert not _length_mismatch.any(), (
    f'length of {int(_length_mismatch.sum())} sequences does not match, incorrect data'
)

In [4]:
# Corpus statistics after preprocessing, printed so they can be compared with
# the statistics reported in the original paper.
_token_counts_per_report = df_notes['NTOKENS_LEN']

_corpus_stats = (
    ('Num. of used records             ', df_notes.shape[0]),
    ('Num. of regular labels           ', len(icd9_unique_list)),
    ('Num. of rolled up labels         ', len(icd9_rolled_unique_list)),
    ('Num. of unique tokens            ', len(notes_tokens_list)),
    ('Avg. num. of tokens per report   ',
     _token_counts_per_report.sum() / len(_token_counts_per_report)),
)

for _stat_label, _stat_value in _corpus_stats:
    print(_stat_label, _stat_value)

Num. of used records              399623
Num. of regular labels            4103
Num. of rolled up labels          781
Num. of unique tokens             53229
Avg. num. of tokens per report    309.05638814582744


In [5]:
def _merge_notes_with_labels(df_labels, code_column):
    """Attach the per-admission label row to every note and count its labels.

    Parameters
    ----------
    df_labels : DataFrame with an ``HADM_ID`` column and a ``code_column``
        column holding a sized collection of codes per admission.
    code_column : name of the label column; the label count is written to
        ``f'{code_column}_LEN'``.

    Returns
    -------
    DataFrame with the columns of ``df_notes`` (minus ``NTOKENS_LEN``), the
    columns of ``df_labels`` and the new ``*_LEN`` column.

    Raises
    ------
    pandas.errors.MergeError if ``HADM_ID`` is not unique in ``df_labels``.
    AssertionError if some note has no matching label row.
    """
    merged = pd.merge(
        df_notes,
        df_labels,
        on=['HADM_ID'],
        how='inner',
        # Each admission must contribute exactly one label row, otherwise notes
        # would be duplicated by the join.
        validate='many_to_one',
    ).drop(columns=['NTOKENS_LEN'])
    merged[f'{code_column}_LEN'] = merged[code_column].apply(len)

    # The label matrices are later stacked row-by-row against features built
    # from df_notes, so the merge must keep exactly one row per note. An inner
    # merge preserves the order of the left keys, hence equal length together
    # with the many-to-one validation implies positional alignment.
    assert len(merged) == len(df_notes), (
        f'merge on HADM_ID kept {len(merged)} of {len(df_notes)} notes; '
        'labels would not be aligned with the notes'
    )
    return merged


df_notes_icd9 = _merge_notes_with_labels(df_diag_icd9, 'ICD9_CODE')
df_notes_icd9_rolled = _merge_notes_with_labels(df_diag_icd9_rolled, 'ICD9_CODE_ROLLED')

print('df_notes_icd9.shape        : ', df_notes_icd9.shape)
print('df_notes_icd9_rolled.shape : ', df_notes_icd9_rolled.shape)

print('Average regular labels per report:', df_notes_icd9['ICD9_CODE_LEN'].sum()/len(df_notes_icd9['ICD9_CODE_LEN']))
print('Average rolled labels per report:', df_notes_icd9_rolled['ICD9_CODE_ROLLED_LEN'].sum()/len(df_notes_icd9_rolled['ICD9_CODE_ROLLED_LEN']))

df_notes_icd9.shape        :  (399623, 9)
df_notes_icd9_rolled.shape :  (399623, 9)
Average regular labels per report: 17.409070548992425
Average rolled labels per report: 15.746078178683408


In [6]:
# Token sequences of all notes as a plain Python list (one list entry per note).
text_list = df_notes['NTOKENS'].tolist()

# Spot-check that the first three sequences kept their recorded lengths.
for _position, _ordinal in enumerate(('1st', '2nd', '3rd')):
    _expected_length = df_notes['NTOKENS_LEN'][_position]
    assert len(text_list[_position]) == _expected_length, (
        f'length of {_ordinal} sequences does not match, incorrect data'
    )


In [7]:
# Locations of the saved gensim model and of the exported vector file.
w2v_model_file = f'{DATA_DIR}word2vec_model.model'
word2vec_file = f'{DATA_DIR}word2vec.vec'

if not os.path.exists(w2v_model_file):
    # No saved model yet: train Word2Vec on the tokenised notes and persist it.
    model_w2v = Word2Vec(
        text_list,
        vector_size=300,
        min_count=1,
        workers=4,
    )
    print('model trained')
    model_w2v.save(w2v_model_file)
else:
    # Reuse the model saved by an earlier run.
    model_w2v = Word2Vec.load(w2v_model_file)
    print('read from saved model file: ', model_w2v)


model trained


NameError: name 'model' is not defined

In [8]:
  # Export the word vectors in the non-binary word2vec format.
  model_w2v.wv.save_word2vec_format(word2vec_file, binary=False)
  print('model saved')

  # list of words not in word2vec vocab
  # (dict1 is the model's vocabulary, dict2 the key view of the token counter)
  dict1 = model_w2v.wv.index_to_key
  dict2 = notes_tokens_list.keys()

  _tokens_absent_from_w2v = dict2 - dict1
  print('tokens not in w2v vocab: ', len(_tokens_absent_from_w2v))

  # Pair every absent token with the value stored for it in notes_tokens_list.
  missing_word_in_w2v_vocab = [
    (_token, notes_tokens_list[_token]) for _token in _tokens_absent_from_w2v
  ]

  print('missing words in vocab: ', missing_word_in_w2v_vocab)

model saved
tokens not in w2v vocab:  26
missing words in vocab:  [('cefdinir', 7), ('reinnervation', 10), ('apoplexy', 5), ('burgdorferi', 6), ('fidaxomicin', 5), ('oxymorphone', 8), ('megakaryocytic', 6), ('larygneal', 5), ('granulicatella', 5), ('gdnxl', 14), ('foliaceus', 10), ('ifgdnxl', 22), ('arrangment', 5), ('ampliprep', 6), ('dasatinib', 10), ('itsfq', 21), ('hnxcjojqj', 23), ('quantitaton', 5), ('subcutis', 5), ('petites', 7), ('fditsfq', 13), ('holdblu', 10), ('barrx', 6), ('discontinuance', 5), ('rivaroxaban', 5), ('hepatocholangiolar', 5)]


In [9]:
# Show the model's one-line summary (vocabulary size, vector size, alpha).
print(model_w2v)

Word2Vec(vocab=53203, vector_size=300, alpha=0.025)


In [17]:
# Build the word embedding matrix: one row per vocabulary word, taken from the
# trained model in the order given by index_to_key.
_vocab_in_model_order = model_w2v.wv.index_to_key
embedding_matrix = model_w2v.wv[_vocab_in_model_order]
print('embedding_matrix.shape: ', embedding_matrix.shape)

embedding_matrix.shape:  (53203, 300)


In [18]:
# Create dict for embedding matrix (word -> row) and the matching matrix.
#
# Row layout (V = vocabulary size of the word2vec model):
#   row 0        '_padding_'  -> all zeros
#   rows 1..V    vocabulary words, in index_to_key order
#   row V + 1    '_unknown_'  -> all zeros
#
# The zero row for padding has to be *prepended*: words are mapped to idx + 1,
# so appending both zero rows at the end would shift every word onto its
# neighbour's vector and make padding read the first word's vector.
_vocab_words = model_w2v.wv.index_to_key
row_dict = {word: idx + 1 for idx, word in enumerate(_vocab_words)}
print('row_dict.length: ', len(row_dict))

# Rebuild the matrix from the model rather than extending the current
# embedding_matrix, so that re-running this cell does not keep adding rows.
_word_vectors = model_w2v.wv[_vocab_words]
_zero_row = np.zeros((1, _word_vectors.shape[1]))
embedding_matrix = np.concatenate((_zero_row, _word_vectors, _zero_row), axis=0)

# Create and map unknown and padding tokens to null
row_dict['_unknown_'] = len(_vocab_words) + 1
row_dict['_padding_'] = 0
print('row_dict.length after padding: ', len(row_dict))

# Every index handed out by row_dict must address a row of the matrix.
assert max(row_dict.values()) == embedding_matrix.shape[0] - 1

# NOTE: an embedding_matrix.p cache written with the previous row layout is
# misaligned with row_dict; delete it so the caching cell stores this matrix
# instead of reloading the old one.

row_dict.length:  53203
row_dict.length after padding:  53205


In [20]:
row_index_dictionary_file = f'{DATA_DIR}row_index_dictionary.p'
embedding_matrix_file = f'{DATA_DIR}embedding_matrix.p'

# Cache policy for both artifacts: when the file is missing, write the object
# currently in memory; when it exists, the file wins and *replaces* the
# in-memory object. Delete the files to force regeneration.
# Files are opened through context managers so the handles are closed (and the
# written data flushed) deterministically.

# save embedded matrix and row_dict to file
if not os.path.exists(row_index_dictionary_file):
    with open(row_index_dictionary_file, 'wb') as f:
        pkl.dump(row_dict, f)
else:
    # read from saved file (pickle: trusted, project-generated files only)
    print('reading row_dict from file: ', row_index_dictionary_file)
    with open(row_index_dictionary_file, 'rb') as f:
        row_dict = pkl.load(f)
    print('row_dict.length after padding: ', len(row_dict))

if not os.path.exists(embedding_matrix_file):
    with open(embedding_matrix_file, 'wb') as f:
        pkl.dump(embedding_matrix, f)
else:
    # read from saved file (pickle: trusted, project-generated files only)
    print('reading embedding_matrix from file: ', embedding_matrix_file)
    with open(embedding_matrix_file, 'rb') as f:
        embedding_matrix = pkl.load(f)
    print('embedding_matrix.shape: ', embedding_matrix.shape)


reading row_dict from file:  ../data/row_index_dictionary.p
row_dict.length after padding:  53205
reading embedding_matrix from file:  ../data/embedding_matrix.p
embedding_matrix.shape:  (53205, 300)


In [23]:
def convert_token_to_index(tokens, row_dict):
    """Translate a sequence of tokens into their integer indices.

    Parameters
    ----------
    tokens : iterable of hashable tokens (typically strings).
    row_dict : mapping from token to integer index; must contain the key
        ``'_unknown_'``, whose index is used for every token that is not a
        key of the mapping.

    Returns
    -------
    list with one index per input token, in input order.

    Raises
    ------
    KeyError if ``row_dict`` has no ``'_unknown_'`` entry.
    """
    # Look the fallback up once instead of once per token.
    unknown_index = row_dict['_unknown_']
    return [row_dict.get(token, unknown_index) for token in tokens]

In [24]:
# Fixed length every note is padded or truncated to.
MAX_LENGTH = 2199


def _pad_to_max_length(token_indices):
    """Return ``token_indices`` as a 1-D array of exactly MAX_LENGTH entries.

    Shorter sequences are filled at the front with the '_padding_' index;
    longer ones lose their trailing entries.
    """
    padded_batch = pad_sequences(
        [token_indices],
        padding='pre',
        truncating='post',
        maxlen=MAX_LENGTH,
        value=row_dict['_padding_'],
    )
    # pad_sequences works on a batch; drop the batch axis of size one.
    return np.squeeze(padded_batch)


# Step 1: tokens -> vocabulary indices. Step 2: fixed-length padding.
_notes_as_indices = df_notes['NTOKENS'].apply(convert_token_to_index, row_dict=row_dict)
indexed_notes = _notes_as_indices.apply(_pad_to_max_length)

print(type(indexed_notes))

<class 'pandas.core.series.Series'>


In [41]:
# Stack the per-note index arrays into a single 2-D feature matrix
# (one row per note).
_padded_note_rows = indexed_notes.tolist()
X = np.vstack(_padded_note_rows)
print('X.type: ', type(X))
print('X.shape: ', X.shape)

X.type:  <class 'numpy.ndarray'>
X.shape:  (399623, 2199)


In [42]:
# Stack the per-note label vectors stored in ICD9_CODE_MLB into one 2-D
# target matrix for the regular ICD-9 codes.
_icd9_label_rows = df_notes_icd9['ICD9_CODE_MLB'].to_numpy()
Y_ICD9 = np.vstack(_icd9_label_rows)
print('Y_ICD9.type: ', type(Y_ICD9), type(Y_ICD9[0]), type(Y_ICD9[0, 0]))
print('Y_ICD9.shape: ', Y_ICD9.shape)

Y_ICD9.type:  <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.int64'>
Y_ICD9.shape:  (399623, 4103)


In [43]:
# Stack the per-note label vectors stored in ICD9_CODE_ROLLED_MLB into one
# 2-D target matrix for the rolled-up ICD-9 codes.
_icd9_rolled_label_rows = df_notes_icd9_rolled['ICD9_CODE_ROLLED_MLB'].to_numpy()
Y_ICD9_ROLLED = np.vstack(_icd9_rolled_label_rows)
print('Y_ICD9_ROLLED.type: ', type(Y_ICD9_ROLLED), type(Y_ICD9_ROLLED[0]), type(Y_ICD9_ROLLED[0, 0]))
print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)

Y_ICD9_ROLLED.type:  <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.int64'>
Y_ICD9_ROLLED.shape:  (399623, 781)


In [69]:
# Sorted array of the distinct note categories; the position of a category in
# this array is its column in the one-hot matrix below.
unique_categories = df_notes['CATEGORY'].unique()
unique_categories.sort()

print('unique_categories len: ', len(unique_categories), unique_categories)

# One-hot encode the category of every note with a single broadcast comparison
# (notes along axis 0, categories along axis 1) instead of a Python loop that
# calls np.where once per note.
_category_per_note = df_notes['CATEGORY'].to_numpy()
cats = (
    _category_per_note[:, None] == np.asarray(unique_categories)[None, :]
).astype(np.float32)

# Every note must fall into exactly one category.
assert (cats.sum(axis=1) == 1).all(), 'each note must have exactly one category'

notes_categories = cats

unique_categories len:  15 ['Case Management ' 'Consult' 'Discharge summary' 'ECG' 'Echo' 'General'
 'Nursing' 'Nursing/other' 'Nutrition' 'Pharmacy' 'Physician ' 'Radiology'
 'Rehab Services' 'Respiratory ' 'Social Work']


In [70]:
shuffled_indices_file = f'{DATA_DIR}shuffled_indices.npy'

# Shuffle
# The permutation is cached on disk so that every run uses the same order.
if os.path.exists(shuffled_indices_file):
    print('read from indices file', )
    indices = np.load(shuffled_indices_file)
else:
    # Draw the permutation from an explicitly seeded generator (same seed
    # value as the random.seed() call at the top of the notebook); seeding the
    # stdlib `random` module has no effect on NumPy's generators.
    indices = np.random.default_rng(3778).permutation(X.shape[0])
    np.save(shuffled_indices_file, indices)

# A cached file created for a different dataset would silently drop, repeat or
# mis-index rows, so insist on a true permutation of the current row range.
assert np.array_equal(np.sort(indices), np.arange(X.shape[0])), (
    f'{shuffled_indices_file} is not a permutation of the {X.shape[0]} rows of X; '
    'delete the file to regenerate it'
)

print (len(indices), indices[5])

# Apply the same permutation to the features, the note categories and both
# label matrices so that rows stay aligned.
X_SH = X[indices]
CAT_SH = notes_categories[indices]
Y_ICD9_SH = Y_ICD9[indices]
Y_ICD9_ROLLED_SH = Y_ICD9_ROLLED[indices]

read from indices file
399623 62930


In [76]:
data_file = f'{DATA_DIR}data.npz'

# Cache policy: when the archive exists it replaces the arrays currently in
# memory; otherwise the in-memory arrays are written to it. Delete the file to
# force it to be rebuilt from the cells above.
if os.path.exists(data_file):
    print('loading from save data file: ', data_file)
    # np.load on an .npz returns a lazily-read archive that holds an open file
    # handle; the context manager closes it once the arrays have been read.
    with np.load(data_file) as data:
        X_SH = data['x']
        CAT_SH = data['cats']
        Y_ICD9_SH = data['reg_y']
        Y_ICD9_ROLLED_SH = data['rol_y']
    print(X_SH.shape, CAT_SH.shape, Y_ICD9_SH.shape, Y_ICD9_ROLLED_SH.shape)
else:
    np.savez_compressed(
        data_file,
        x=X_SH,
        cats=CAT_SH,
        reg_y=Y_ICD9_SH,
        rol_y=Y_ICD9_ROLLED_SH,
    )

loading from save data file:  ../data/data.npz
(399623, 2199) (399623, 15) (399623, 4103) (399623, 781)
