In [1]:
import pandas as pd
import pickle as pkl
import string
import Levenshtein as lev
from collections import Counter
import os.path
import gzip


DATA_DIR = '../data/'

In [2]:
# Punctuation stripping: every character of string.punctuation except the apostrophe.

mypunctuation = string.punctuation.replace("'", "")

# Built once: maps each character in mypunctuation to a single space so the
# whole replacement happens in one pass over the text.
_PUNCTUATION_TO_SPACE = str.maketrans(mypunctuation, ' ' * len(mypunctuation))

def remove_punctuations(text):
    """Return ``text`` with each character of ``mypunctuation`` replaced by a space.

    Apostrophes are kept because they were removed from ``mypunctuation``.
    Characters are replaced one-for-one, so the length of the string is
    unchanged and runs of punctuation become runs of spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)


In [3]:
# Unique HADM IDs with Diabetes
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open(f'{DATA_DIR}diag_diabetes_hadm_ids.p', 'rb') as hadm_ids_file:
    # The context manager closes the handle as soon as the object is loaded.
    df_hdm_id_diabetes = pkl.load(hadm_ids_file)

print(df_hdm_id_diabetes.shape)
print(df_hdm_id_diabetes.head(2))

(14222,)
0    100001
1    100009
Name: HADM_ID, dtype: int64


In [4]:
# Columns of NOTEEVENTS that the rest of the notebook uses.
NOTE_COLUMNS = ['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'DESCRIPTION', 'TEXT']

# usecols makes the parser skip every other column instead of loading the full
# table and discarding most of it afterwards; the trailing [NOTE_COLUMNS]
# keeps the column order explicit regardless of the order in the file.
df_notes = pd.read_csv(
    f'{DATA_DIR}NOTEEVENTS.csv.gz',
    usecols=NOTE_COLUMNS,
    low_memory=False,
)[NOTE_COLUMNS]
print(df_notes.shape)
print(df_notes.head(2))

(2083180, 5)
   SUBJECT_ID   HADM_ID           CATEGORY DESCRIPTION  \
0       22532  167853.0  Discharge summary      Report   
1       13702  107527.0  Discharge summary      Report   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  


In [5]:
# Inner join on HADM_ID: keeps only the notes whose admission id also appears
# in df_hdm_id_diabetes.
df_notes_diabetes = df_notes.merge(df_hdm_id_diabetes, how='inner', on='HADM_ID')

print(df_notes_diabetes.shape)
print(df_notes_diabetes.head(2))

(406203, 5)
   SUBJECT_ID   HADM_ID           CATEGORY DESCRIPTION  \
0       28063  121936.0  Discharge summary      Report   
1       28063  121936.0               Echo      Report   

                                                TEXT  
0  Admission Date:  [**2125-2-9**]              D...  
1  PATIENT/TEST INFORMATION:\nIndication: Aortic ...  


In [6]:
# Normalise the note text in three steps:
#   1. replace punctuation (except apostrophes) with spaces,
#   2. replace every digit with the letter 'd',
#   3. lower-case the result.
# The regex is a raw string so that \d is not treated as a (invalid) string escape.
df_notes_diabetes['TEXT'] = (
    df_notes_diabetes['TEXT']
    .apply(remove_punctuations)
    .replace(r'[\d]', 'd', regex=True)
    .str.lower()
)

df_notes_diabetes['TEXT'].head(2)

0    admission date      dddd d d                 d...
1    patient test information \nindication  aortic ...
Name: TEXT, dtype: object

In [7]:
# Tokenise each cleaned note: str.split() with no arguments splits on runs of
# whitespace, so every row of TOKENS holds a list of words.
df_notes_diabetes['TOKENS'] = df_notes_diabetes['TEXT'].str.split()
print(df_notes_diabetes.head(2))

   SUBJECT_ID   HADM_ID           CATEGORY DESCRIPTION  \
0       28063  121936.0  Discharge summary      Report   
1       28063  121936.0               Echo      Report   

                                                TEXT  \
0  admission date      dddd d d                 d...   
1  patient test information \nindication  aortic ...   

                                              TOKENS  
0  [admission, date, dddd, d, d, discharge, date,...  
1  [patient, test, information, indication, aorti...  


In [8]:
# One {word: count} dictionary per note, in the same order as the rows of
# df_notes_diabetes.
word_freq_dict_list = [
  dict(Counter(note_tokens)) for note_tokens in df_notes_diabetes['TOKENS']
]

print(len(word_freq_dict_list))

406203


In [9]:
# Merge the per-note word-frequency dictionaries into one overall word-frequency table.

def createWordCorpus(dictList, min_len, max_len, wordCorpus):
  """Add the counts of selected dictionaries from ``dictList`` into ``wordCorpus``.

  A dictionary is selected only when its number of keys is strictly greater
  than ``min_len`` and at most ``max_len``.

  Args:
    dictList: iterable of {word: count} dictionaries.
    min_len: exclusive lower bound on the number of keys.
    max_len: inclusive upper bound on the number of keys.
    wordCorpus: dictionary of running totals; it is updated in place.

  Returns:
    The same ``wordCorpus`` object that was passed in.
  """
  for word_counts in dictList:
    if not (min_len < len(word_counts) <= max_len):
      continue
    for word, count in word_counts.items():
      wordCorpus[word] = wordCorpus.get(word, 0) + count
  return wordCorpus

# Histogram of vocabulary size per note (number of distinct words).
# Anything outside 1..2000 - including empty notes - lands in len_rest.
len_200 = len_500 = len_1000 = len_2000 = len_rest = 0

for note_word_freq in word_freq_dict_list:
  n_unique = len(note_word_freq)
  if 0 < n_unique <= 200:
    len_200 += 1
  elif 200 < n_unique <= 500:
    len_500 += 1
  elif 500 < n_unique <= 1000:
    len_1000 += 1
  elif 1000 < n_unique <= 2000:
    len_2000 += 1
  else:
    len_rest += 1

print(len_200, len_500, len_1000, len_2000, len_rest)


# Overall word-frequency table, cached on disk so the merge is only done once.
wordCorpus_file = f'{DATA_DIR}wordCorpus.p'

if not os.path.exists(wordCorpus_file):
  wordCorpus = {}
  # Merge band by band, in this order: it fixes the insertion order of the
  # resulting dictionary. The running vocabulary size is printed after each band.
  for band_min, band_max in ((0, 200), (200, 500), (500, 1000), (1000, 100000)):
    wordCorpus = createWordCorpus(word_freq_dict_list, band_min, band_max, wordCorpus)
    print(len(wordCorpus))
  # write to file - word corpus with frequency - overall
  with open(wordCorpus_file, 'wb') as corpus_out:
    pkl.dump(wordCorpus, corpus_out)
else:
  # read from file - word corpus with frequency - overall
  # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
  print('reading from file: ', wordCorpus_file)
  with open(wordCorpus_file, 'rb') as corpus_in:
    wordCorpus = pkl.load(corpus_in)

print(len(wordCorpus))

303741 80749 20849 851 13
reading from file:  ../data/wordCorpus.p
162715


In [10]:
# Split the corpus by overall frequency. Despite its name, tokens_freq_gt_5
# holds every token with a count of freq_limit or more (i.e. >= 5).
freq_limit = 5

tokens_freq_lt_5 = {
  token: count for token, count in wordCorpus.items() if count < freq_limit
}
tokens_freq_gt_5 = {
  token: count for token, count in wordCorpus.items() if count >= freq_limit
}

print(len(tokens_freq_lt_5), len(tokens_freq_gt_5))

109486 53229


In [11]:
# Find closest match for a word in a word list using Levenshtein distance
def closestMatch(candidateToken, wordList):
  """Return the word in ``wordList`` with the smallest edit distance to ``candidateToken``.

  When several words share the smallest distance, the one that appears last
  in ``wordList`` is returned (the comparison is ``<=``).

  Args:
    candidateToken: the token to look up.
    wordList: iterable of candidate words.

  Returns:
    A tuple ``(candidateToken, similar_word, min_dist)``. For an empty
    ``wordList`` this is ``(candidateToken, '', 99)``.
  """
  similar_word = ''
  # None means "nothing compared yet". A fixed numeric starting value would
  # silently reject every word whose distance exceeds it.
  min_dist = None
  for word in wordList:
    dist = lev.distance(candidateToken, word)
    if min_dist is None or dist <= min_dist:
      min_dist = dist
      similar_word = word
  if min_dist is None:
    # No candidates at all: keep the historical placeholder distance.
    min_dist = 99
  return candidateToken, similar_word, min_dist

In [12]:
# token mapping -> map each presumably misspelt (low-frequency) token to its
# closest higher-frequency token; the result is cached on disk.

word_token_map_file = f'{DATA_DIR}word_token_map.p'

if not os.path.exists(word_token_map_file):
  token_map = {}

  for ii, ct in enumerate(tokens_freq_lt_5):
    candidateToken, similar_word, min_dist = closestMatch(ct, tokens_freq_gt_5)
    token_map[ct] = similar_word
    # Progress sample every 10000 tokens.
    if ii % 10000 == 0:
      print(ii, candidateToken, similar_word, min_dist)
  # write to file - low-frequency token -> replacement token
  with open(word_token_map_file, 'wb') as token_map_out:
    pkl.dump(token_map, token_map_out)
else:
  print('reading from ', word_token_map_file)
  # read from file - low-frequency token -> replacement token
  # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
  with open(word_token_map_file, 'rb') as token_map_in:
    token_map = pkl.load(token_map_in)

print(len(token_map))
# Spot check on one known misspelling; .get avoids a KeyError when this token
# is not in the map (prints None instead).
print(token_map.get('exacerbatiopn'))

reading from  ../data/word_token_map.p
109486
exacerbation


In [13]:
def map_tokens(tokens):
  """Return a new list where each low-frequency token is replaced via ``token_map``.

  A token found in ``tokens_freq_lt_5`` is swapped for ``token_map[token]``;
  every other token is passed through unchanged. Order and length are kept.
  """
  return [
    token_map[word] if word in tokens_freq_lt_5 else word
    for word in tokens
  ]

In [14]:
# NTOKENS: the token list of each note after passing it through map_tokens,
# which swaps low-frequency tokens for their mapped replacement.
df_notes_diabetes['NTOKENS'] = df_notes_diabetes['TOKENS'].apply(map_tokens)
print(df_notes_diabetes['NTOKENS'].head(5))

0    [admission, date, dddd, d, d, discharge, date,...
1    [patient, test, information, indication, aorti...
2    [patient, test, information, indication, aorti...
3    [sinus, rhythm, frequent, atrial, premature, b...
4    [rhythm, is, most, likely, sinus, rhythm, with...
Name: NTOKENS, dtype: object


In [15]:
# Number of tokens in each note's NTOKENS list.
df_notes_diabetes['NTOKENS_LEN'] = df_notes_diabetes['NTOKENS'].apply(len)

In [16]:
# Filtering based on # tokens - keep notes with GT 9 and LT 2200 tokens
token_counts = df_notes_diabetes['NTOKENS_LEN']

kept_rows = (token_counts > 9) & (token_counts < 2200)
df_notes_diabetes_final = df_notes_diabetes[kept_rows].reset_index(drop=True)
print(df_notes_diabetes_final.shape)

# The rejected notes (too short or too long).
rejected_rows = (token_counts <= 9) | (token_counts >= 2200)
df_notes_diabetes_filtered = df_notes_diabetes[rejected_rows].reset_index(drop=True)
print(df_notes_diabetes_filtered.shape)

print(df_notes_diabetes_final.head(2))

(399623, 8)
(6580, 8)
   SUBJECT_ID   HADM_ID CATEGORY DESCRIPTION  \
0       28063  121936.0     Echo      Report   
1       28063  121936.0     Echo      Report   

                                                TEXT  \
0  patient test information \nindication  aortic ...   
1  patient test information \nindication   aortic...   

                                              TOKENS  \
0  [patient, test, information, indication, aorti...   
1  [patient, test, information, indication, aorti...   

                                             NTOKENS  NTOKENS_LEN  
0  [patient, test, information, indication, aorti...          428  
1  [patient, test, information, indication, aorti...          306  


In [17]:
# Drop the raw TOKENS column (NTOKENS is kept).
# errors='ignore' makes this cell safe to re-run: the frame is rebound to the
# same name, so on a second run TOKENS is already gone.
df_notes_diabetes_final = df_notes_diabetes_final.drop(columns=['TOKENS'], errors='ignore')
print(df_notes_diabetes_final.shape)

(399623, 7)


In [18]:
# Show the Python type of the NTOKENS value stored at index label 0.
print(type(df_notes_diabetes_final['NTOKENS'][0]))


<class 'list'>


In [19]:
# Gzip-compressed pickle of the final notes frame.
# NOTE: when the file already exists, the frame computed above is REPLACED by
# the saved copy; delete the file to persist a freshly computed frame.
notes_final_file = f'{DATA_DIR}notes_final.gz'

if not os.path.exists(notes_final_file):
  # write to file
  with gzip.open(notes_final_file, "wb") as notes_out:
      pkl.dump(df_notes_diabetes_final, notes_out)
else:
  # read saved notes_final
  print('reading saved notes_final_file: ', notes_final_file)
  with gzip.open(notes_final_file, "rb") as notes_in:
      df_notes_diabetes_final = pkl.load(notes_in)

print(type(df_notes_diabetes_final['NTOKENS'][0]))

<class 'list'>


In [20]:
# Preview the first two rows of the final frame (after the save/load step).
print(df_notes_diabetes_final.head(2))

   SUBJECT_ID   HADM_ID CATEGORY DESCRIPTION  \
0       28063  121936.0     Echo      Report   
1       28063  121936.0     Echo      Report   

                                                TEXT  \
0  patient test information \nindication  aortic ...   
1  patient test information \nindication   aortic...   

                                             NTOKENS  NTOKENS_LEN  
0  [patient, test, information, indication, aorti...          428  
1  [patient, test, information, indication, aorti...          306  


In [21]:
# Validate that NTOKENS_LEN matches the actual length of NTOKENS on every row.
# Checking all rows avoids depending on specific row labels, which change
# whenever the input data or the filter thresholds change.
actual_ntokens_len = df_notes_diabetes_final['NTOKENS'].map(len)
length_mismatch = actual_ntokens_len != df_notes_diabetes_final['NTOKENS_LEN']
assert not length_mismatch.any(), (
    f'length of {int(length_mismatch.sum())} sequences does not match, incorrect data'
)

In [23]:
# Persist (or reload) the dictionary of tokens with frequency >= freq_limit.
notes_tokens_list_file = f'{DATA_DIR}notes_tokens_list.p'

if not os.path.exists(notes_tokens_list_file):
  # write to file
  with open(notes_tokens_list_file, 'wb') as tokens_out:
    pkl.dump(tokens_freq_gt_5, tokens_out)
else:
  # NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
  with open(notes_tokens_list_file, 'rb') as tokens_in:
    tokens_freq_gt_5 = pkl.load(tokens_in)

# Print the size in both branches (not the dictionary itself).
print('tokens_freq_gt_5 len: ', len(tokens_freq_gt_5))


tokens_freq_gt_5 len:  53229
