# Formatting Text to Vectors

Part of future work on developing models to predict medical entities in sentances. Final X and y tensors can be used to train sequence-to-sequence models for NER.

In [1]:
import nltk
import pandas as pd
import numpy as np
from section_parse import run
from nltk import RegexpChunkParser,word_tokenize
from nltk.chunk.regexp import ChunkRule,ChinkRule,SplitRule,MergeRule
import re

### Loading in Discharge Summary Sections

In [2]:
title = "HISTORY OF PRESENT ILLNESS:"
#title = "DISCHARGE MEDICATIONS:"
medication_sections = run(title)
medication_sections = [i for i in medication_sections if i != "NOT FOUND"]

## Text Cleaning Funciton

In [3]:
def clean_text(text):
    bad_chars = [":","*"]
    space_chars = ["[","]","(",")"]
    for c in bad_chars:
        text = text.replace(c,"")
    for c in space_chars:
        text = text.replace(c," ")
    return text.lower()

In [4]:
print(clean_text(medication_sections[6]))

history of present illness
 known firstname    known lastname 1852  is a 62-year-old left-handed man who is here for a
follow up of his left sphenoid meningioma.  i last saw him on
 2149-11-17  and his head ct showed growth of the left sphenoid
meningioma.  he is seizure free.  today, he is here with his
wife
and daughter.   name  ni    does not have headache, nausea, vomiting,
urinary incontinence, or fall.

his neurological problem began on  2142-6-22  when he became
confused and disoriented in a hotel bathroom.  at that time, he
was visiting his daughter for a wedding.  his wife found him
slumped over in the bath tube.  according to her, his eyes
looked
funny.  he could not stand up.  his verbal output did not make
sense.  he was brought to  doctor first name 1853  hospital in placentia,
ca.  he woke up 7 to 8 hours later in the emergency room.  he
felt very tired after the event.  he was hospitalized from
 2142-6-22  to  2142-6-25 .  he had a cardiac pacemaker placement due
to irre

## Loading Entity Dataframe

In [5]:
ent_df = pd.read_csv("./data/entities.csv")
bad_ents = ["solution","dose","lot","enema","-","in","can","pack","ring","bar","bags","cart","jar","pad","as","it","in"]
ent_df = ent_df[ent_df["Name"].isin(bad_ents)==0].copy()
section_mask = ent_df["Entity"].isin(["DRUG","ROUTE","UNIT","CONDITION","SYMPTOM","DOSE"])
ent_df = ent_df[section_mask].dropna()
ent_df.head()

Unnamed: 0,Name,Entity
0,pyridostigmine bromide syrup,DRUG
1,critic-aid clear af,DRUG
2,ibup,DRUG
3,posaconazole oral liquid (*ind*),DRUG
4,byetta,DRUG


# Formatting Data for ML Models

Steps:

1. Convert list of sections to list of sentances
2. Tokenize sentances and get POS for each word
3. Parse certain tokens into chunks
4. Get medical entity tag for token / chunks from entity data frame
5. Output as list of sequences of words,POS,and labels.
6. Format sequences by padding to equal lengths for models

In [6]:
def search(ent_name):
    return ent_df[ent_df["Name"]==ent_name]

def sections_to_sentances(sections):
    sentances = []
    for section in sections:
        section = clean_text(section)
        section_sentances = section.split(".")
        sentances += [i for i in section_sentances if len(i)>0]
    return sentances

def parse_pos(text):
    text = word_tokenize(text)
    pos = nltk.pos_tag(text)
    return pos

def parse_chunks(pos):
    cr1 = ChunkRule("<NN><IN><NN>+","Chunk Some Stuff")
    cr2 = ChunkRule("<NN><NN>","chunk noun pairs")
    cr3 = ChunkRule("<NN><NNS>","chunk noun and nns pairs")
    cr4 = ChunkRule("<JJ><NNS>","chunk other stuff")
    cr5 = ChunkRule("<JJ>""<JJ>","yet more chunks")
    chunk_parser = RegexpChunkParser([cr1,cr2,cr3,cr4,cr5],chunk_label="NP")
    chunked_text = chunk_parser.parse(pos)
    return chunked_text

def format_chunks(chunks):
    formatted_chunks = []
    for chunk in chunks:
        if type(chunk) != tuple:
            chunk = (' '.join([i[0] for i in chunk.leaves()]),'NP')
        formatted_chunks.append(chunk)
    return formatted_chunks
    
def return_chunk_ent_type(name,ent_df):
    mask = ent_df["Name"] == name
    if sum(mask) > 0:
        return ent_df[mask]["Entity"].iloc[0]
    
    elif len(name.split())>1:
        for word in name.split():
            mask = ent_df["Name"]==word
            if sum(mask) > 0:
                return ent_df[mask]["Entity"].iloc[0]
    return 'O'
    
def medical_chunker(text,ent_df=ent_df):
    pos = parse_pos(text)
    chunked_text = parse_chunks(pos)
    chunks = format_chunks(chunked_text)
    chunk_df = pd.DataFrame(data=chunks,columns=["Name",'POS'])
    
    # Tag single words
    chunk_df["TAG"] = chunk_df["Name"].apply(lambda x:return_chunk_ent_type(x,ent_df))
    return chunk_df.values

def create_dataset(sections,ent_df):
    
    sentances = sections_to_sentances(sections)
    dataset = []
    
    # create a df for each sentance and combine
    for i,sentance in enumerate(sentances):
        sequence = medical_chunker(sentance,ent_df=ent_df)
        dataset.append(sequence)
        
    return dataset

In [7]:
sections = medication_sections[:100]
seqs = create_dataset(sections,ent_df)
seqs = [i for i in seqs if len(i)>0]



# Formatting Words -> Numberic

Words are now formatted as sequences for each sentance with corresponding POS and entity tags. Words are now converted to number values / ids and padded to the appropriate length.

In [8]:
print("max sequence length:",max([len(i) for i in seqs]),"\n")
print("Sample:")
print(seqs[0])

max sequence length: 98 

Sample:
[['history' 'NN' 'O']
 ['of' 'IN' 'O']
 ['present' 'JJ' 'O']
 ['illness' 'NN' 'O']
 ['this' 'DT' 'O']
 ['is' 'VBZ' 'O']
 ['an' 'DT' 'O']
 ['81-year-old' 'JJ' 'O']
 ['female' 'NN' 'O']
 ['with' 'IN' 'O']
 ['a' 'DT' 'O']
 ['history of emphysema' 'NP' 'CONDITION']
 ['not' 'RB' 'O']
 ['on' 'IN' 'O']
 ['home o2' 'NP' 'UNIT']
 [',' ',' 'O']
 ['who' 'WP' 'O']
 ['presents' 'VBZ' 'O']
 ['with' 'IN' 'O']
 ['three' 'CD' 'O']
 ['days' 'NNS' 'O']
 ['of' 'IN' 'O']
 ['shortness of breath' 'NP' 'SYMPTOM']
 ['thought' 'VBN' 'O']
 ['by' 'IN' 'O']
 ['her' 'PRP$' 'O']
 ['primary' 'JJ' 'O']
 ['care doctor' 'NP' 'O']
 ['to' 'TO' 'O']
 ['be' 'VB' 'O']
 ['a' 'DT' 'O']
 ['copd flare' 'NP' 'CONDITION']]


In [9]:
def get_word_ids(sentances,feature = 0):
    words = []
    for sentance in sentances:
        words += list([word[feature] for word in sentance])
    word_dict = {word:i for i,word in enumerate(set(words))}
    return word_dict

In [10]:
word_ids = get_word_ids(seqs,0)
pos_ids = get_word_ids(seqs,1)
tag_ids = get_word_ids(seqs,2)

In [11]:
def words_to_ids(sentances,word_ids,tag_ids,pos_ids):
    vector = []
    for sentance in sentances:
        vector.append(list([[word_ids[w[0]],pos_ids[w[1]],tag_ids[w[2]]] for w in sentance]))
    return np.array(vector)

In [12]:
vectors = words_to_ids(seqs,word_ids,tag_ids,pos_ids)
print("Numeric Representation:")
print(vectors[0][:4])
print('')
print("Word Representation:")
print(seqs[0][:4])

Numeric Representation:
[[515, 8, 2], [740, 3, 2], [1836, 14, 2], [2035, 8, 2]]

Word Representation:
[['history' 'NN' 'O']
 ['of' 'IN' 'O']
 ['present' 'JJ' 'O']
 ['illness' 'NN' 'O']]


# Padding Sequences and Creating X,Y variables

In [13]:
def pad_sequences(vectors,length):
    matrix = []
    for v in vectors:
        pad_length = length-len(v)
        fill = np.zeros([pad_length,3])
        padded_seq = np.vstack([v,fill])
        matrix.append(padded_seq)
    return np.array(matrix)

In [14]:
max_length = max([len(i) for i in vectors])
matrix = pad_sequences(vectors,max_length)
print(matrix.shape)

(1640, 98, 3)


In [15]:
print(matrix[0][:5],"\n...\n",matrix[0][-5:])

[[5.150e+02 8.000e+00 2.000e+00]
 [7.400e+02 3.000e+00 2.000e+00]
 [1.836e+03 1.400e+01 2.000e+00]
 [2.035e+03 8.000e+00 2.000e+00]
 [1.615e+03 2.800e+01 2.000e+00]] 
...
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [16]:
def create_x_y(matrix):
    x = []
    y = []
    for sequences in matrix:
        xi = [[i[0],i[1]] for i in sequences]
        yi = [[i[2] for i in sequences]]
        x.append(xi)
        y.append(yi)
    return np.array(x),np.array(y)

In [19]:
x,y = create_x_y(matrix)
y = np.reshape(y,[-1,y.shape[-1]])
print("X-shape:",x.shape)
print(x[0][:5])
print('')
print("Y-shape:",y.shape)
print(y[0][:5])

X-shape: (1640, 98, 2)
[[ 515.    8.]
 [ 740.    3.]
 [1836.   14.]
 [2035.    8.]
 [1615.   28.]]

Y-shape: (1640, 98)
[2. 2. 2. 2. 2.]


---