# Using a Hybrid Tagger to Tag Medical Entities in Patient Histories

In [1]:
# IPython magic: clear the interactive namespace; -f skips the confirmation prompt.
%reset -f

In [2]:
import pandas as pd
import re
import spacy
from spacy import displacy
import numpy as np
import en_core_web_sm

from section_parse import run

In [3]:
# Fetch the sections found under this heading via section_parse.run and
# discard the "NOT FOUND" placeholder entries.
# NOTE(review): presumably run() yields one entry per source document - confirm.
title = "HISTORY OF PRESENT ILLNESS:"
medication_sections = [section for section in run(title) if section != "NOT FOUND"]

In [4]:
def in_text(x,text):
    """Return True when the string form of ``x`` occurs in lower-cased ``text``.

    This is a plain substring test. ``x`` is formatted to a string as-is (it is
    not lower-cased), while ``text`` is lower-cased before the lookup.
    """
    needle = f"{x}"
    haystack = text.lower()
    return needle in haystack
    
def add_ent_matches(text,entities):
    """Locate whole-token occurrences of each entity name in ``text``.

    Args:
        text: Text to search. It is lower-cased before matching, so names are
            expected to be lower-case already (they are not lower-cased here).
        entities: Iterable of ``(name, ent_type)`` pairs.

    Returns:
        A list of ``[start, end, ent_type]`` lists, where ``start``/``end`` are
        character offsets of the name itself. Results are grouped in the order
        the entities were supplied, not sorted by position.
    """
    lowered = text.lower()
    matches = []
    for name, ent_type in entities:
        # Names are literal strings, not regexes: escape them so characters
        # such as "*", "+" or "(" cannot alter (or break) the pattern.
        literal = re.escape(f"{name}")
        if not literal:
            # An empty name would produce zero-width matches everywhere.
            continue
        # Lookarounds assert "no letter/digit on either side" without
        # consuming a character, so matches at the very start or end of the
        # text and matches separated by a single delimiter are all found.
        pattern = f"(?<![a-zA-Z0-9]){literal}(?![a-zA-Z0-9])"
        for match in re.finditer(pattern, lowered):
            matches.append([match.start(), match.end(), ent_type])
    return matches
    
def add_pattern_matches(text):
    """Find decimal numbers (``1.5``) and numeric ranges (``1-2``) in ``text``.

    A number is accepted when it is not immediately preceded by a letter and
    not immediately followed by a letter or a period.

    Returns:
        A list of ``[start, end, 'CARDINAL']`` lists with the character offsets
        of each number. All decimal matches come first, then all range matches.
    """
    # Raw strings avoid invalid-escape warnings for "\d". The lookarounds check
    # the neighbouring characters without consuming them, so numbers at the
    # start/end of the text and numbers separated by one character are found.
    patterns = [
        r"(?<![a-zA-Z])\d+\.\d+(?![a-zA-Z.])",
        r"(?<![a-zA-Z])\d+-\d+(?![a-zA-Z.])",
    ]
    matches = []
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            matches.append([match.start(), match.end(), 'CARDINAL'])
    return matches

def drop_subsets(matches):
    """Remove spans that are fully covered by another span.

    Args:
        matches: List of ``[start, end, label]`` items. The callers in this
            file pass it sorted by ``(start, end)``, which this function
            relies on.

    Returns:
        A new list holding, for each start offset, only the longest span, and
        only if it ends beyond every span already kept. Partially overlapping
        spans are kept; the input list is not modified.
    """
    reduced = []
    last_end = 0
    final_index = len(matches) - 1

    for i, match in enumerate(matches):
        # With (start, end) ordering, a following item with the same start is
        # at least as long, so the current one is a subset (or a duplicate).
        if i < final_index and matches[i + 1][0] == match[0]:
            continue
        # Ending at or before the furthest kept end means an earlier kept span
        # already covers this one.
        if match[1] > last_end:
            reduced.append(match)
            last_end = match[1]
    return reduced

def get_ent_locs(text,entities):
    """Collect name-based and pattern-based spans for ``text``.

    The spans from ``add_ent_matches`` and ``add_pattern_matches`` are merged,
    ordered by ``(start, end)`` and then passed through ``drop_subsets``.
    """
    candidates = add_ent_matches(text,entities)
    candidates = candidates + add_pattern_matches(text)
    candidates.sort(key=lambda span: (span[0], span[1]))
    return drop_subsets(candidates)

def show_ents(text,entities):
    """Render ``text`` with the given spans highlighted using displaCy.

    ``entities`` is an iterable of ``(start, end, label)`` sequences. The
    rendering uses displaCy's manual mode with ``jupyter=True``; nothing is
    returned.
    """
    spans = []
    for span in entities:
        spans.append({"start":span[0],"end":span[1],"label":span[2]})

    # Custom highlight colours for a few labels.
    palette = {"DRUG": "rgb(60,180,240)","DOSE":"rgb(240,180,60)","ROUTE":"rgb(200,200,200)"}
    displacy.render(
        [{"text":text,"ents":spans}],
        style="ent",
        manual=True,
        options={"colors":palette},
        jupyter=True,
    )

def get_entity_tags(text_block,ent_df):
    """Tag the entities from ``ent_df`` that occur in ``text_block``.

    Args:
        text_block: Text to tag; it is lower-cased before any matching.
        ent_df: DataFrame with a ``Name`` column (entity text) and an
            ``Entity`` column (label). It is not modified.

    Returns:
        The ``[start, end, label]`` spans produced by ``get_ent_locs`` for the
        lower-cased text.
    """
    text = text_block.lower()
    # Cheap substring pre-filter so only names that can possibly match are
    # handed to the regex search. The mask is kept local instead of being
    # written back as a column, so the caller's frame is left untouched.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    present = ent_df["Name"].apply(lambda name: in_text(name, text)).astype(bool)
    entities = ent_df.loc[present, ["Name","Entity"]].values
    return get_ent_locs(text,entities)

def clean_text(text):
    """Return ``text`` with every ':' and '*' replaced by a single space.

    NOTE: a later cell in this notebook defines ``clean_text`` again, which
    rebinds the name once that cell has run.
    """
    return re.sub("[:*]"," ",text)

In [5]:
# Load the entity lookup table (one row per entity name with its label)
# and preview the first rows.
ent_df = pd.read_csv("./entities.csv")
ent_df.head()

Unnamed: 0,Name,Entity
0,neo*im*pneumococcal 7-valent,DRUG
1,maxalt,DRUG
2,reopro,DRUG
3,zemplar,DRUG
4,send 500mg vial,DRUG


In [6]:
# Entity names to exclude from the lookup table.
# NOTE(review): presumably these are too ambiguous as ordinary words - confirm.
bad_ents = ["solution","dose","lot","enema","-","in","can","pack","ring","bar","bags","cart","jar","pad","as","it"]
# .copy() makes the filtered table an independent frame, so later code can add
# or modify columns without pandas' SettingWithCopyWarning.
ent_df = ent_df[~ent_df["Name"].isin(bad_ents)].copy()

In [7]:
def add_cust_ents(text,ent_df):
    """Return the lookup-table spans for ``text`` that carry a custom label.

    Spans come from ``get_entity_tags``; only those labelled CONDITION,
    SYMPTOM, DRUG, UNIT or ROUTE are kept, each as a fresh
    ``[start, end, label]`` list.
    """
    wanted = ("CONDITION","SYMPTOM","DRUG","UNIT","ROUTE")
    kept = []
    for span in get_entity_tags(text,ent_df):
        label = span[2]
        if label in wanted:
            kept.append([span[0],span[1],label])
    return kept

def add_doc_ents(text,nlp):
    """Return spans for selected entity labels found by ``nlp`` in ``text``.

    ``nlp`` is called on ``text`` and the ``ents`` of the result are scanned;
    entities labelled DATE, CARDINAL, QUANTITY, TIME, ORDINAL or PERCENT are
    returned as ``[start_char, end_char, label_]`` lists, in document order.
    """
    wanted = ("DATE","CARDINAL","QUANTITY","TIME","ORDINAL","PERCENT")
    spans = []
    for ent in nlp(text).ents:
        if ent.label_ in wanted:
            spans.append([ent.start_char,ent.end_char,ent.label_])
    return spans

In [8]:
def get_hybrid_tags(text,ent_df,nlp):
    """Tag ``text`` with both the lookup table and the ``nlp`` pipeline.

    The spans from ``add_cust_ents`` and ``add_doc_ents`` are merged, ordered
    by ``(start, end)`` and passed through ``drop_subsets``.
    """
    combined = add_cust_ents(text,ent_df) + add_doc_ents(text,nlp)
    combined.sort(key=lambda span: (span[0], span[1]))
    return drop_subsets(combined)

In [9]:
def clean_text(text):
    """Return ``text`` with every newline replaced by a single space.

    All other characters, including asterisks, are left untouched.
    """
    return text.replace("\n"," ")

In [10]:
# Load the spaCy pipeline that supplies the generic entities (dates, numbers, ...).
nlp = en_core_web_sm.load()
# Demo on a single section; index 2 is hard-coded, so at least three
# sections must have been found.
text = medication_sections[2]
cleaned_text = clean_text(text)
# Tag with both the lookup table and spaCy, then render the result inline.
ents = get_hybrid_tags(cleaned_text,ent_df,nlp)
show_ents(cleaned_text,ents)

---