In [None]:
!pip install pyhealth inflect autocorrect torchtext gensim==3.6.0

In [None]:
import numpy as np
import pandas as pd
from pyhealth.medcode import InnerMap
from pyhealth.datasets import MIMIC4Dataset

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import os
import csv
import pickle
import inflect
from autocorrect import spell
from collections import OrderedDict


import gensim
from gensim.models import Word2Vec
import pickle

import torch
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import statistics
# for progress bar
from tqdm import tqdm_notebook
import random
import json
import tqdm
from sklearn.metrics import *

# set seed
# One shared seed for every random number generator this notebook touches
# (Python `random`, NumPy and PyTorch) so that repeated runs draw the same numbers.
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes, not this running kernel -- confirm.
os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
# Open the MIMIC-IV subset through PyHealth, requesting the diagnosis and procedure
# ICD tables. `dataset.patients` is what the labelling cell iterates over.
dataset = MIMIC4Dataset(
        root="data/mimic4_subset",
        tables=["diagnoses_icd", "procedures_icd"],
    )
#dataset.stat()
#dataset.info()

In [None]:
# ICD code maps; kept for ad-hoc inspection such as icd9cm.lookup('V1582').
icd9cm = InnerMap.load("ICD9CM")
icd10cm = InnerMap.load("ICD10CM")
#smoker = icd9cm.lookup("V15.82")

# Diagnosis codes that mark a patient as a (current or former) tobacco user.
TOBACCO_ICD9_CODES = ('V1582', '3051')  # matched exactly against ICD9CM events
TOBACCO_ICD10_PREFIX = 'F17'            # matched as a prefix against ICD10CM events


def _is_tobacco_event(event):
    """Return True when a diagnosis event carries one of the tobacco codes above."""
    if event.vocabulary == 'ICD9CM':
        return event.code in TOBACCO_ICD9_CODES
    if event.vocabulary == 'ICD10CM':
        return event.code.startswith(TOBACCO_ICD10_PREFIX)
    return False


counter = 0
patient_dict = dataset.patients
labels = []
for subject_id, patient in patient_dict.items():
    visit_dict = patient.visits
    if not visit_dict:
        # No admission to attach a label to. Without this guard `visit_id` would be
        # left over from the previous patient (or undefined for the very first one),
        # producing a row whose hadm_id belongs to somebody else.
        continue

    tobacco = 0
    # Find the first occurrence of a tobacco code. Later visits keep coding the
    # patient as a smoker / former smoker, but (from spot checks) the notes tend to
    # mention it only on the visit where it was first identified, so that visit's id
    # is the one stored. For patients without any tobacco code the loop runs to the
    # end and the last visit's id is stored.
    for visit_id, visit in visit_dict.items():
        events = visit.get_event_list('diagnoses_icd')
        if any(_is_tobacco_event(event) for event in events):
            tobacco = 1
            break
    labels.append({'subject_id':subject_id,'label':tobacco,'hadm_id':visit_id})

# Explicit columns keep the frame well-formed even when `labels` is empty.
label_df = pd.DataFrame(labels, columns=['subject_id', 'label', 'hadm_id'])
# Ids come out of PyHealth as strings; make them numeric so they can be merged with
# the ids read from the notes CSV.
label_df["subject_id"] = pd.to_numeric(label_df["subject_id"])
label_df["hadm_id"] = pd.to_numeric(label_df["hadm_id"])

In [None]:
# function that cleans text
# still need to account for contractions, abbreviations, and numbers/fractions
# Module-level defaults read by clean_text(): the stemmer used when stem_text=True
# and the stop-word list used when remove_stopwords=True. They are created once here
# rather than on every call.
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice
def clean_text(text, replace_numbers = False, remove_rare = False, remove_punctuation = False, stem_text = False, remove_stopwords = False, remove_num = False , spell_check = False, remove_repeat = False):
    """Normalise one note into a lower-cased, space-separated string.

    The text is always stripped of surrounding spaces, lower-cased and passed through
    `misc_cleaning` (regex substitutions). The optional steps then run in this fixed
    order: remove_repeat, remove_punctuation, replace_numbers, remove_rare, stem_text,
    remove_stopwords, remove_num. Finally, if " chief complaint " occurs within the
    first 300 characters, everything up to and including it is dropped.

    Args:
        text (str): raw note text.
        replace_numbers (bool): spell out all-digit tokens ("3" -> "three") via inflect.
        remove_rare (bool): drop the 10 least frequent word types of this text.
        remove_punctuation (bool): delete every character in `string.punctuation`.
        stem_text (bool): stem each token with `default_stemmer`.
        remove_stopwords (bool): drop tokens found in `default_stopwords`.
        remove_num (bool): drop whitespace-separated tokens that are purely numeric.
        spell_check (bool): accepted for backward compatibility; currently unused.
        remove_repeat (bool): drop repeated sentences, keeping the first occurrence.

    Returns:
        str: the cleaned text.
    """
    def misc_cleaning(text):
        text = re.sub("-([a-zA-Z]+)", r"\1", text) # drops a hyphen that directly precedes letters (joins the two parts)
        text = re.sub(' y ', '', text) # gets rid of random y accent stuff scattered through the text
        text = re.sub('yyy', 'y', text)
        # NOTE(review): inside this class `+-=` is a character *range* (0x2B-0x3D), so
        # characters such as ':' ';' '<' are kept as well -- confirm that is intended.
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        # NOTE(review): `\0` is an octal NUL escape in a Python regex, so this matches
        # NUL followed by "s"; presumably "0s" was meant -- confirm before changing.
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        return text

    # function to tokenize text which is used in a lot of the later processing
    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    text = text.strip(' ') # strip whitespaces
    text = text.lower() # lowercase
    text = misc_cleaning(text) # look at function, random cleaning stuff

    # optional: removes repeated sentences, keeping first occurrences in order
    # NOTE(review): misc_cleaning has already replaced '.' with spaces at this point,
    # so sentence splitting has little to work with -- confirm this step is effective.
    if remove_repeat:
        sentences = sent_tokenize(text)
        sentences = list(dict.fromkeys(sentences))
        text = " ".join(sentences)

    # removes punctuation
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))

    # optional: replaces numbers ("3") with their word counterparts ("three")
    if replace_numbers:
        p = inflect.engine()
        words = word_tokenize(text)
        text = " ".join(p.number_to_words(word) if word.isdigit() else word for word in words)

    # optional: removes the rarest words in each text --> right now it's 10
    if remove_rare:
        tokens = word_tokenize(text)
        freq_dist = nltk.FreqDist(tokens)
        # most_common() is sorted by descending count, so its tail holds the rarest
        # word types. (Plain .keys() is in insertion order, not frequency order.)
        rarewords = {word for word, _ in freq_dist.most_common()[-10:]}
        text = " ".join(word for word in tokens if word not in rarewords)

    # optional: stems text using Porter Stemmer
    if stem_text:
        stemmer = default_stemmer
        text = " ".join(stemmer.stem(t) for t in tokenize_text(text))

    # removes stop words such as "a", "the", etc.
    if remove_stopwords:
        stop_words = default_stopwords
        text = " ".join(w for w in tokenize_text(text) if w not in stop_words)

    # optional: removes numbers completely from the text
    if remove_num:
        text = " ".join(x for x in text.split() if not x.isnumeric())

    #remove headers from discharge notes
    #name unit admission date discharge date date birth sex service medicine allergies known allergies adverse drug reactions attending chief complaint
    header_marker = " chief complaint "
    headers = text.find(header_marker, 1, 300)
    if headers > -1:
        # keep only what follows the marker
        text = text[headers + len(header_marker):]
    return text


In [None]:
# Read the discharge notes; the columns used later are subject_id, hadm_id and text.
df_notes_discharge = pd.read_csv("data/mimic4_notes/discharge.csv")
#df_notes_discharge_detail = pd.read_csv("data/mimic4_notes/discharge_detail.csv")
#df_notes_radiology = pd.read_csv("data/mimic4_notes/radiology.csv")

print(df_notes_discharge.columns)
print('total len', len(df_notes_discharge))
#df_notes_discharge_detail.columns

#print(df_notes_discharge['subject_id'])
#print(df_notes_radiology['text'][0])

#group by patient and concatenate all notes for one patient
#df_notes_discharge = df_notes_discharge.groupby(['subject_id'], as_index = False).agg({'text': ' '.join})


# With the group-by above disabled, these two prints repeat the values shown above
# (one row per note, not per patient).
print(df_notes_discharge.columns)
print('len of patients', len(df_notes_discharge))

#trim down patients
# Keeps the first 5000 rows (notes) of the file to bound the work done downstream.
df_notes_discharge = df_notes_discharge.head(5000)
print('final len', len(df_notes_discharge))

In [None]:
import dask.dataframe as dd
from dask.multiprocessing import get

#ddf = dd.from_pandas(df_notes_discharge, npartitions=7)
#meta_df = pd.DataFrame(columns=["subject_id", "text", "new_text"], dtype=object)

#ddf['text'] = ddf['text'].apply(lambda text: clean_text(text, remove_punctuation = True, remove_stopwords = True, remove_repeat = True))

#res = ddf.map_partitions(lambda df: df.assign(new_text = clean_text(df['text'], remove_punctuation = True, remove_stopwords = True, remove_repeat = True)), meta=meta_df)
#res.to_csv("data/mimic4_notes/discharge_clean.csv", index=False)

#pandas_df = ddf.compute()
#pandas_df.to_csv("data/mimic4_notes/discharge_clean.csv", index=False)

   
# save labels of same size and order
# return just the labels of the patients in the correct order as y, order of left table is maintained.
#labels = df_notes_discharge.merge(label_df, on='subject_id', how='inner')['label']
# Inner join on (subject_id, hadm_id): a note survives only when it belongs to the
# one admission stored for that patient in label_df.
out_df = df_notes_discharge.merge(label_df, on=['subject_id','hadm_id'], how='inner')[['subject_id','hadm_id','label','text']]

#limit to 781_unclean patients who have both discharge notes and codes for the same visit
# i.e. keep at most the first 781 matched rows; "781_unclean" is the name of the
# output folder this run writes to.
out_df = out_df.head(781)

#HERE IS ABLATION OF CLEANING TEXT
# The clean_text() call is disabled for this run, so `notes` below holds raw text.
#out_df['text'] = out_df['text'].apply(lambda text: clean_text(text, remove_punctuation = True, remove_stopwords = True, remove_repeat = True, remove_num = True))

# Labels as an array, in the same row order as `notes`.
labels_out = out_df['label'].to_numpy()
#save notes for embeddings
notes = list(out_df['text'])  

In [None]:
#TESTING
# Ad-hoc inspection cell: nothing defined here is needed by later cells.
#tables=["diagnoses_icd", "procedures_icd"],
pd.set_option('display.max_colwidth', None)

# Class balance: number of rows, number of positive labels and their ratio.
print(len(labels_out))
print(sum(labels_out))
print('positivity rate', sum(labels_out) / len(labels_out))

print(labels[0])
print(icd9cm.lookup('V1582'))

# Spot check one (subject_id, hadm_id) pair in the raw diagnoses table ...
find_pat = pd.read_csv("data/mimic4_subset/diagnoses_icd.csv")
print(find_pat.columns)
out = find_pat.loc[(find_pat['subject_id'] == 10000032) & (find_pat['hadm_id'] == 22595853)]
print(out)

print(df_notes_discharge.columns)
print(label_df.columns)

# ... and the same pair in the merged notes/labels frame (displayed as cell output).
out = out_df.loc[(out_df['subject_id'] == 10000032) & (out_df['hadm_id'] == 22595853)]
out

In [None]:
# process notes converted to index array of numbers of same length

# transforms text to a sequence of integers padded to same length
#from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences

def textTokenize(notes):
    """Tokenise every note and build a vocabulary over the whole corpus.

    Args:
        notes: iterable of note strings.

    Returns:
        tokenized_texts (List[List[str]]): one token list per note, in input order
        word2idx (Dict[str, int]): token -> index in order of first appearance;
            '<pad>' is 0 and '<unk>' is 1
        max_len: largest number of tokens in any note
        mean_len: mean number of tokens per note
        std_len: standard deviation of the number of tokens per note
    """
    tokenizer = get_tokenizer("basic_english")
    # Reserved entries first so that padding is always index 0 and unknown is 1.
    vocab = {'<pad>': 0, '<unk>': 1}
    all_tokens = []
    token_counts = []
    for note in notes:
        tokens = tokenizer(note)
        all_tokens.append(tokens)
        token_counts.append(len(tokens))
        for tok in tokens:
            # A new token receives the next free index, i.e. the current vocab size.
            vocab.setdefault(tok, len(vocab))
    avg_count = np.mean(token_counts)
    std_count = np.std(token_counts)
    longest = np.max(token_counts)
    return all_tokens, vocab, longest, avg_count, std_count

def encodeTokenizedText(tokenized_texts, word2idx, max_len):
    """Encode token lists as fixed-width index arrays (forward, reversed and mask).

    Each note is cut to at most `max_len` tokens and then post-padded with '<pad>'.
    The reversed encoding contains the same (possibly truncated) tokens in reverse
    order, followed by the same amount of padding, so both directions cover the same
    words. Tokens missing from `word2idx` map to '<unk>'. The input lists are not
    modified.

    Args:
        tokenized_texts (List[List[str]]): one token list per note.
        word2idx (Dict[str, int]): vocabulary; must contain '<pad>'.
        max_len (int): width of every encoded row.

    Returns:
        input_ids (np.array): token indexes, shape (N, max_len)
        rev_input_ids (np.array): indexes of the reversed tokens, shape (N, max_len)
        masks (np.array): 1 for real tokens and 0 for padding, shape (N, max_len)
    """
    pad_id = word2idx['<pad>']
    unk_id = word2idx.get('<unk>', pad_id)

    input_ids = []
    rev_input_ids = []
    masks = []
    for tokenized_sent in tokenized_texts:
        # Slicing copies, so the caller's list is neither truncated nor padded.
        tokens = tokenized_sent[:max_len]
        n_pad = max_len - len(tokens)
        ids = [word2idx.get(token, unk_id) for token in tokens]
        padding = [pad_id] * n_pad

        # use post-padding ( will mask for rnn and bi directional rnn !)
        input_ids.append(ids + padding)
        # Built for every note, including truncated ones, from that note's own tokens.
        rev_input_ids.append(ids[::-1] + padding)
        # Masks of same size to track masking
        masks.append([1] * len(tokens) + [0] * n_pad)

    return np.array(input_ids, dtype=int), np.array(rev_input_ids, dtype=int), np.array(masks, dtype=int)


tokenized_texts, word2idx, max_len, mean_len, std_len = textTokenize(notes)
# Replace the true maximum with int(mean + 4 * std + 1) tokens so that a few very
# long notes do not dictate the padded width; longer notes are truncated on encoding.
normal_max_len = int((mean_len + 4*std_len) + 1)
max_len = normal_max_len

# input_ids are the input to cnn and rnn models, as the tokenized text
input_ids, rev_input_ids, masks = encodeTokenizedText(tokenized_texts, word2idx, max_len)

In [None]:
# The three encoded arrays should all have one row per note, so these should match.
print(len(input_ids), len(rev_input_ids), len(masks))

In [None]:
# MAKE EMBEDDING MATRIX
import gensim.downloader as api

#pretrain = api.load('word2vec-google-news-300')

# Load the previously pickled pretrained vectors (see the commented api.load above).
# NOTE: unpickling executes code embedded in the file -- only load a pickle that was
# produced locally / comes from a trusted source.
with open('data/embeddings/pretrain.pckl', 'rb') as f:
    pretrain = pickle.load(f)

# Make Word2Vec embeddings from the notes themselves
def make_w2v_model(notes, window, workers, epochs, vector_size, min_count, save_path="w2v.model"):
    """Train a Word2Vec model on the notes and save it to disk.

    Args:
        notes: iterable of notes; each note is either a raw string (tokenised here
            with the torchtext "basic_english" tokenizer) or a list of tokens.
        window (int): Word2Vec context window.
        workers (int): number of training threads.
        epochs (int): number of training passes over the corpus.
        vector_size (int): embedding dimension.
        min_count (int): tokens seen fewer times than this are ignored.
        save_path (str): where the trained model is written.

    Returns:
        The trained gensim Word2Vec model (also saved to `save_path`).
    """
    # gensim expects token lists. Given a plain string it iterates over its
    # characters, which would produce a character-level vocabulary.
    basic_tokenizer = None
    sentences = []
    for note in notes:
        if isinstance(note, str):
            if basic_tokenizer is None:
                basic_tokenizer = get_tokenizer("basic_english")
            sentences.append(basic_tokenizer(note))
        else:
            sentences.append(list(note))

    # Build the model without a corpus so that the constructor does not run its own
    # training pass; training then happens exactly once, for `epochs` passes.
    model = gensim.models.Word2Vec(size=vector_size, window=window, min_count=min_count, workers=workers)
    model.build_vocab(sentences)
    print('Start training process...') 
    model.train(sentences, total_examples=model.corpus_count, epochs=epochs)
    model.save(save_path)
    print("Model Saved")
    return model

# Train the corpus-specific Word2Vec model; it is reloaded from "w2v.model" below.
# NOTE(review): `notes` is a list of raw strings while gensim expects token lists --
# confirm that make_w2v_model tokenises them before training.
make_w2v_model(notes,  window=5, workers=1, epochs=20, vector_size=300, min_count=2)

def _fill_embedding_matrix(word_index, keyed_vectors, progress=None):
    """Build the (len(word_index), dim) matrix shared by both embedding sources.

    Rows start as uniform noise in [-0.25, 0.25]; rows whose word has a vector in
    `keyed_vectors` are overwritten with it, and the '<pad>' row is all zeros.
    `progress`, when given, wraps the iteration (e.g. a tqdm progress bar).
    """
    # Take the width from the vectors themselves rather than assuming 300.
    dim = keyed_vectors.vector_size
    embedding_matrix = np.random.uniform(-0.25, 0.25, (len(word_index), dim))

    items = word_index.items()
    if progress is not None:
        items = progress(items)
    for word, i in items:
        # Direct lookup; no need to copy every pretrained vector into a dict first.
        if word in keyed_vectors.vocab:
            embedding_matrix[i] = keyed_vectors[word]

    # Zero the padding row last so that it stays zero even if the vectors happen to
    # contain a '<pad>' entry.
    embedding_matrix[word_index['<pad>']] = np.zeros((dim,))
    return embedding_matrix


def word_Embed_w2v(word_index, model):   
    """Embedding matrix for our vocabulary from a Word2Vec model trained on the notes.

    Args:
        word_index (Dict[str, int]): vocabulary (token -> row index), with '<pad>'.
        model: gensim Word2Vec model exposing its vectors as `model.wv`.

    Returns:
        embeddings (np.array): matrix with shape (N, d) where N is the size of
            word_index and d is the model's vector size
    """
    keyed_vectors = model.wv
    print('word vectors len is ',len(keyed_vectors.vocab))
    return _fill_embedding_matrix(word_index, keyed_vectors)


def word_Embed_GNV(word_index):   
    """ Load the pretrained vectors for each token in our vocabulary. 
    For tokens with no pretrained vectors, random vectors of the same length are used.

    Reads the module-level `pretrain` object loaded earlier in the notebook.

     Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """
    #pretrain = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
    # Accept both a full model (vectors under .wv) and a bare KeyedVectors object.
    keyed_vectors = getattr(pretrain, 'wv', pretrain)
    print('pretrain len is ',len(keyed_vectors.vocab))
    return _fill_embedding_matrix(word_index, keyed_vectors, progress=tqdm_notebook)

# Reload the model that make_w2v_model saved and build one embedding matrix per
# source; both are indexed by word2idx, so row i belongs to the token with index i.
w2v_model = Word2Vec.load("w2v.model")
embedding_matrix_w2v = word_Embed_w2v(word2idx, w2v_model)

embedding_matrix_GNV = word_Embed_GNV(word2idx)

In [None]:
#dump encoded notes and embeddings
def _dump_pickle(path, obj, message):
    """Pickle `obj` to `path` (creating the parent directory if needed) and report it."""
    # Without this the very first dump fails on a fresh checkout where the output
    # folders do not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    print(message)


# (path, object, message) for every artifact, written in this order.
# Note: the text cleaning step is disabled for this run, so `notes` holds the raw
# text even though the file name and message say "cleaned"/"cleansed".
for _path, _obj, _message in [
    ("data/781_unclean/labels.pckl", labels_out, "Saved labels"),
    ('data/781_unclean/cleaned_notes.pckl', notes, "Saved cleansed notes"),
    ('data/781_unclean/df_notes_discharge.pckl', df_notes_discharge, "Saved cleansed df_notes_discharge "),
    ('data/781_unclean/embeddings/pretrain.pckl', pretrain, "Saved pretrain"),
    ('data/781_unclean/embeddings/tokenized_notes.pckl', input_ids, "Saved Tokenized Notes"),
    ('data/781_unclean/embeddings/rev_tokenized_notes.pckl', rev_input_ids, "Saved Reverse Tokenized Notes"),
    ('data/781_unclean/embeddings/masks.pckl', masks, "Saved Masks"),
    ('data/781_unclean/embeddings/embedding_matrix_GNV.pckl', embedding_matrix_GNV, "Saved Google Vector Word Embedding Matrix"),
    ('data/781_unclean/embeddings/embedding_matrix_w2v.pckl', embedding_matrix_w2v, "Saved Word 2 Vector Embedding Matrix"),
    ('data/781_unclean/embeddings/word_index_eff.pckl', word2idx, "Saved Word Indices"),
    ('data/781_unclean/embeddings/max_len_eff.pckl', max_len, "Saved Maximum Length of One Patient's Notes"),
]:
    _dump_pickle(_path, _obj, _message)

In [None]:
# Quick checksum of the first encoded note: the sum of its token indexes. Padding is
# index 0 and adds nothing, so the reversed encoding of the same note (next cell)
# should give the same value when the note was not truncated.
sum(input_ids[0])

In [None]:
# Same checksum for the reversed encoding of the first note; compare with the
# forward value from the previous cell.
sum(rev_input_ids[0])

In [None]:
# Also write the encoded notes as JSON (nested lists of ints) next to the pickles.
with open('data/781_unclean/embeddings/test_input.json', 'w') as f:
    json.dump(input_ids.tolist(), f)

In [None]:
# Marker printed once every cell above has run.
print('done')