# Using Hybrid Tagger to Tag Medical Entities in Patient Histories

---

Sample notebook with Python code for tagging medical entities in a block of text. It first uses the en_core_web_sm model to identify common entities such as numeric and time-based values, then applies a heuristic tagger built from known medical entities (from MIMIC III and Medical Net) to tag specific medical entities.

In [1]:
# Imports

import pandas as pd
import re
import spacy
from spacy import displacy
import numpy as np
import en_core_web_sm

from section_parse import run

In [2]:
# Work only with the discharge-summary sections titled
# "HISTORY OF PRESENT ILLNESS:"; entries equal to the "NOT FOUND"
# placeholder are discarded.
title = "HISTORY OF PRESENT ILLNESS:"
medication_sections = [section for section in run(title) if section != "NOT FOUND"]

### Heuristic Medical Entity Tagger:

Given a list of known medical entities, finds and returns their locations in a block of text.

In [3]:
def in_text(x,text):
    '''Report whether the string form of ``x`` occurs in ``text``.

    ``text`` is lower-cased before the search while ``x`` is used as given,
    so the check is case-insensitive only when ``x`` is already lower-case.

    Returns True when the substring is present, otherwise False.
    '''
    return f"{x}" in text.lower()
    
def add_ent_matches(text,entities):
    '''Locate every whole-token occurrence of each entity name in a block of text.

    The text is lower-cased before searching. A name only counts as a match
    when it is not directly preceded or followed by an ASCII letter or digit.

    Args:
        text: Block of text to search.
        entities: Iterable of ``(name, ent_type)`` pairs.

    Returns:
        A list of ``[start, end, ent_type]`` entries, where ``start``/``end``
        are the character offsets of the name itself, grouped in the order
        the entities were given.
    '''
    lowered = text.lower()
    matches = []
    for name, ent_type in entities:
        needle = str(name)
        if not needle:
            # An empty name would only produce meaningless zero-length spans.
            continue
        # re.escape makes names containing regex metacharacters (for example
        # "(*ind*)") match literally instead of raising or mis-matching.
        # Lookarounds test the neighbouring characters without consuming
        # them, so names at the very start or end of the text, and
        # occurrences separated by a single character, are all found.
        pattern = "(?<![a-zA-Z0-9])" + re.escape(needle) + "(?![a-zA-Z0-9])"
        matches.extend(
            [match.start(), match.end(), ent_type]
            for match in re.finditer(pattern, lowered)
        )
    return matches
    
def add_pattern_matches(text):
    '''Locate decimal numbers and numeric ranges in a block of text.

    Two patterns are searched in turn: decimals such as ``12.5`` and
    hyphenated ranges such as ``10-20``. A candidate is rejected when it is
    directly preceded by an ASCII letter, or directly followed by an ASCII
    letter or a period.

    Args:
        text: Block of text to search.

    Returns:
        A list of ``[start, end, 'CARDINAL']`` entries; all decimal matches
        come first, followed by all range matches.
    '''
    # Raw strings avoid invalid-escape warnings. The lookarounds check the
    # neighbouring characters without consuming them, so numbers at the very
    # start or end of the text, and numbers separated by one character, are
    # no longer skipped.
    patterns = (
        r"(?<![a-zA-Z])\d+\.\d+(?![a-zA-Z.])",
        r"(?<![a-zA-Z])\d+-\d+(?![a-zA-Z.])",
    )
    matches = []
    for pattern in patterns:
        matches.extend(
            [match.start(), match.end(), 'CARDINAL']
            for match in re.finditer(pattern, text)
        )
    return matches

def drop_subsets(matches):
    '''Drop matches whose span lies entirely inside another match's span.

    Each match is a sequence whose first two items are the start and end
    offsets. A match is removed when some other match starts at or before it
    and ends at or after it; of several matches covering the identical span,
    only the first is kept. Partially overlapping matches are all kept.

    Args:
        matches: List of ``[start, end, label]`` entries, in any order.

    Returns:
        A new list holding the surviving entries, ordered by start offset.
    '''
    # Visit spans by start offset, longest first among equal starts, so any
    # span that contains another is always seen before the one it contains.
    ordered = sorted(matches, key=lambda span: (span[0], -span[1]))

    reduced = []
    furthest_end = None
    for span in ordered:
        # Every earlier span starts at or before this one, so this span is
        # contained in one of them exactly when it does not reach past the
        # furthest end seen so far.
        if furthest_end is None or span[1] > furthest_end:
            reduced.append(span)
            furthest_end = span[1]
    return reduced

def get_ent_locs(text,entities):
    '''Find entity-name and numeric-pattern spans in ``text``.

    Collects the results of add_ent_matches and add_pattern_matches, orders
    them by (start, end) and passes them through drop_subsets.
    '''
    candidates = add_ent_matches(text,entities) + add_pattern_matches(text)
    candidates.sort(key=lambda span: (span[0], span[1]))
    return drop_subsets(candidates)

def show_ents(text,entities):
    '''Render ``text`` with the given entity spans highlighted.

    ``entities`` is a sequence of ``[start, end, label]`` entries. They are
    handed to displaCy in manual mode with ``jupyter=True``; DRUG, DOSE and
    ROUTE labels get custom colours. Nothing is returned.
    '''
    spans = [{"start":ent[0],"end":ent[1],"label":ent[2]} for ent in entities]
    palette = {"DRUG": "rgb(60,180,240)","DOSE":"rgb(240,180,60)","ROUTE":"rgb(200,200,200)"}
    displacy.render(
        [{"text":text,"ents":spans}],
        style="ent",
        manual=True,
        options={"colors":palette},
        jupyter=True,
    )

def get_entity_tags(text_block,ent_df):
    '''Return the locations of known entities within a block of text.

    Args:
        text_block: Text to tag; it is lower-cased before matching, and the
            returned offsets refer to that lower-cased text.
        ent_df: DataFrame with a "Name" column (entity text) and an "Entity"
            column (entity label). It is not modified.

    Returns:
        The ``[start, end, label]`` entries produced by get_ent_locs for the
        entities whose name occurs somewhere in the text.
    '''
    text = text_block.lower()
    # Keep the presence mask local rather than writing an "in_text" column
    # into ent_df: that mutated the caller's frame and triggers pandas'
    # SettingWithCopyWarning when ent_df is itself a filtered frame.
    present = ent_df["Name"].apply(lambda x: in_text(x, text)).astype(bool)
    # Pre-filtering keeps the regex search to names that can actually match.
    entities = ent_df.loc[present, ["Name", "Entity"]].values
    return get_ent_locs(text, entities)

def clean_text(text):
    '''Return ``text`` with every ":" and "*" character replaced by a space.'''
    # NOTE(review): a later cell in this notebook defines clean_text again;
    # once that cell runs, the name refers to the later definition.
    return re.sub("[:*]"," ",text)

### Entities.csv:

Combines known medical entities from MedicalNet.com and MIMIC III Dataset.

In [4]:
# Read the combined entity list from disk
ent_df = pd.read_csv("./data/entities.csv")

# Entity names that are also everyday words (or simply incorrect) and are
# therefore excluded from the list
bad_ents = ["solution","dose","lot","enema","-","in","can","pack","ring","bar","bags","cart","jar","pad","as","it","in"]
ent_df = ent_df[~ent_df["Name"].isin(bad_ents)]
ent_df.head()

Unnamed: 0,Name,Entity
0,pyridostigmine bromide syrup,DRUG
1,critic-aid clear af,DRUG
2,ibup,DRUG
3,posaconazole oral liquid (*ind*),DRUG
4,byetta,DRUG


### Using en_core_web_sm for Common Date/Numeric Entities and Combining Models:

In [5]:
def add_cust_ents(text,ent_df):
    '''Tag the medical entities listed in ``ent_df`` within ``text``.

    Runs get_entity_tags and keeps only the entries labelled CONDITION,
    SYMPTOM, DRUG, UNIT or ROUTE, returned as ``[start, end, label]`` lists.
    '''
    allowed = ["CONDITION","SYMPTOM","DRUG","UNIT","ROUTE"]
    tagged = get_entity_tags(text,ent_df)
    return [[span[0],span[1],span[2]] for span in tagged if span[2] in allowed]

def add_doc_ents(text,nlp):
    '''Collect numeric and time-based entities that an NER model finds in ``text``.

    ``nlp`` is called on the text, and each entity of the result labelled
    DATE, CARDINAL, QUANTITY, TIME, ORDINAL or PERCENT is returned as
    ``[start_char, end_char, label]``.
    '''
    wanted = ["DATE","CARDINAL","QUANTITY","TIME","ORDINAL","PERCENT"]
    found = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted:
            found.append([ent.start_char,ent.end_char,ent.label_])
    return found

def get_hybrid_tags(text,ent_df,nlp):
    '''Tag medical, numeric and time-based entities in ``text``.

    Merges the spans from add_cust_ents (entity list) and add_doc_ents
    (NER model), orders them by (start, end) and passes them through
    drop_subsets.
    '''
    combined = add_cust_ents(text,ent_df) + add_doc_ents(text,nlp)
    combined.sort(key=lambda span: (span[0], span[1]))
    return drop_subsets(combined)

def clean_text(text):
    '''Return ``text`` with each line break replaced by a space, for better tagging.'''
    return text.replace("\n"," ")

In [6]:
# Load the en_core_web_sm pipeline that supplies the numeric/time-based entities
nlp = en_core_web_sm.load()
# Use the third extracted section as the example; this index assumes at
# least three sections remained after the "NOT FOUND" entries were removed
text = medication_sections[2]
# clean_text resolves to the most recently executed definition, which
# replaces line breaks with spaces
cleaned_text = clean_text(text)
# Tag with both the entity list and the NER model, then render the result
ents = get_hybrid_tags(cleaned_text,ent_df,nlp)
show_ents(cleaned_text,ents)

---