In [1]:
import pandas as pd
import re
import spacy
import numpy as np

In [2]:
from section_parse import run

In [3]:
title = "DISCHARGE MEDICATIONS:"
medication_sections = run(title)
medication_sections = [i for i in medication_sections if i != "NOT FOUND"]

In [4]:
print(medication_sections[0])

DISCHARGE MEDICATIONS:
1. Levothyroxine 75 mcg p.o. q.d.
2. Citalopram 10 mg p.o. q.d.
3. Aspirin 81 mg p.o. q.d.
4. Fluticasone 110 mcg two puffs inhaled b.i.d.
5. Salmeterol Diskus one inhalation b.i.d.
6. Acetaminophen 325-650 mg p.o. q.4-6h. prn.
7. Ipratropium bromide MDI two puffs inhaled q.2h. prn.
8. Albuterol 1-2 puffs inhaled q.2h. prn.
9. Zolpidem tartrate 5 mg p.o. q.h.s. prn.
10. Isosorbide dinitrate 10 mg p.o. t.i.d.
11. Diltiazem 60 mg p.o. q.i.d.
12. Pantoprazole 40 mg p.o. q.24h.
13. Trazodone 25 mg p.o. q.h.s. prn.
14. SubQ Heparin 5000 units subcutaneous b.i.d. until such
time that the patient is able to get out of bed twice a day.
15. Cepacol lozenges q.2h. prn.
16. Levofloxacin 500 mg p.o. q.d. for a seven day course to
be completed on [**2118-6-21**].
17. Kaopectate/Benadryl/lidocaine 5 mL p.o. b.i.d. prn, not
to be given around mealtimes for concern of dysphagia induced
by lidocaine.
18. Lorazepam 0.5-2 mg IV q.6h. prn.


# Use a pre-trained model

In [5]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [6]:
print(medication_sections[0])

DISCHARGE MEDICATIONS:
1. Levothyroxine 75 mcg p.o. q.d.
2. Citalopram 10 mg p.o. q.d.
3. Aspirin 81 mg p.o. q.d.
4. Fluticasone 110 mcg two puffs inhaled b.i.d.
5. Salmeterol Diskus one inhalation b.i.d.
6. Acetaminophen 325-650 mg p.o. q.4-6h. prn.
7. Ipratropium bromide MDI two puffs inhaled q.2h. prn.
8. Albuterol 1-2 puffs inhaled q.2h. prn.
9. Zolpidem tartrate 5 mg p.o. q.h.s. prn.
10. Isosorbide dinitrate 10 mg p.o. t.i.d.
11. Diltiazem 60 mg p.o. q.i.d.
12. Pantoprazole 40 mg p.o. q.24h.
13. Trazodone 25 mg p.o. q.h.s. prn.
14. SubQ Heparin 5000 units subcutaneous b.i.d. until such
time that the patient is able to get out of bed twice a day.
15. Cepacol lozenges q.2h. prn.
16. Levofloxacin 500 mg p.o. q.d. for a seven day course to
be completed on [**2118-6-21**].
17. Kaopectate/Benadryl/lidocaine 5 mL p.o. b.i.d. prn, not
to be given around mealtimes for concern of dysphagia induced
by lidocaine.
18. Lorazepam 0.5-2 mg IV q.6h. prn.


### Tags:

**NOMINAL** - position in a list (eg 1., 2., 3.,...)

**CARDINAL** - The amount of something (medicine entity,number,quantities)

**MEDICINE ENTITY** - Medication names etc

In [7]:
doc = nlp(medication_sections[0])
displacy.render(doc,style="ent",jupyter=True)

Pre-trained model does reasonable for things like numbers and dates but doesn't pick up on medication entities

# Custom Tagging

In [8]:
# Create a custom NER Dataset for medication
def clean_string(string):
    # remove special characters with space
    for char in ['(',')',".",",","[","]"]:
        string = string.replace(char,'')
        
    string = re.sub(r"\n[0-9].",' ',string)
    # replace linebreaks
    for char in ["\n",'*',":"]:
        string = string.replace(char,' ')
    string = string.strip()
    
    # lowercase
    string = string.lower()
    return string

def get_word_lists(sections):
    word_list = [clean_string(i).split(' ') for i in sections]
    return word_list

def get_vocab(sections):
    word_list = get_word_lists(sections)
    vocab = []
    for lists in word_list:
        vocab += lists
    return vocab

In [9]:
def get_counts(titles):
    '''gets counts of section titles'''
    counts = {}
    for title in titles:
        if title in counts:
            counts[title] += 1
        else:
            counts[title] = 1
    return counts

def format_counts(count_dict):
    '''formats section titles counts as dataframe'''
    counts = pd.DataFrame()
    counts['Title'] = count_dict.keys()
    counts['Counts'] = count_dict.values()
    counts = counts.sort_values('Counts',ascending=False)
    return counts

In [10]:
vocab = get_vocab(medication_sections)
counts = get_counts(vocab)
counts = format_counts(counts)
counts.head(20)

Unnamed: 0,Title,Counts
2,,1621526
124,tablet,455238
6,po,344239
10,mg,343473
99,sig,329496
100,1,279638
21,one,254748
152,daily,210411
68,day,169965
115,2,163826


In [11]:
word_list = get_word_lists(medication_sections[:5])
print(medication_sections[0])
print("-"*40)
print(word_list[0])

DISCHARGE MEDICATIONS:
1. Levothyroxine 75 mcg p.o. q.d.
2. Citalopram 10 mg p.o. q.d.
3. Aspirin 81 mg p.o. q.d.
4. Fluticasone 110 mcg two puffs inhaled b.i.d.
5. Salmeterol Diskus one inhalation b.i.d.
6. Acetaminophen 325-650 mg p.o. q.4-6h. prn.
7. Ipratropium bromide MDI two puffs inhaled q.2h. prn.
8. Albuterol 1-2 puffs inhaled q.2h. prn.
9. Zolpidem tartrate 5 mg p.o. q.h.s. prn.
10. Isosorbide dinitrate 10 mg p.o. t.i.d.
11. Diltiazem 60 mg p.o. q.i.d.
12. Pantoprazole 40 mg p.o. q.24h.
13. Trazodone 25 mg p.o. q.h.s. prn.
14. SubQ Heparin 5000 units subcutaneous b.i.d. until such
time that the patient is able to get out of bed twice a day.
15. Cepacol lozenges q.2h. prn.
16. Levofloxacin 500 mg p.o. q.d. for a seven day course to
be completed on [**2118-6-21**].
17. Kaopectate/Benadryl/lidocaine 5 mL p.o. b.i.d. prn, not
to be given around mealtimes for concern of dysphagia induced
by lidocaine.
18. Lorazepam 0.5-2 mg IV q.6h. prn.
----------------------------------------
['

## Known Drugs, Dose, and Routes from Prescriptions.csv

#### Tag Known Drug Names

In [12]:
def load_drug_entities():
    drugs = np.load("./drug_entities.npy")
    drugs = [d.lower() for d in drugs if (len(d)>4)]
    non_ents = ["solution"]
    for ent in non_ents:
        drugs.remove(ent)
    drugs = list(set(drugs))
    #drugs = add_single_words(drugs,l=4)
    return drugs

def add_single_words(ents,l=4):
    a = []
    for i in ents:
        a += i.split()
    ents+=a   
    return [i for i in ents if len(i)>l]

In [13]:
drugs = load_drug_entities()
drugs[-10:]

['sulfa',
 'trifluoperazine hcl',
 'tenofovir disoproxil (viread)',
 'bisopro',
 'pilocarpine hcl',
 '*nf* bisoprolol fumarate',
 'benazepril hcl',
 'carvedilol suspension',
 'formoterol fumarate',
 'levoxyl']

In [14]:
def tag_entities(text,drugs):
    tags = []
    for drug in drugs:
        if f" {drug} " in text.lower():
            tags.append((drug,'DRUG'))
    return tags

def get_locations(text,names):
    matches = []
    for name in names:
        drug_matches = re.finditer(name,text)
        for match in drug_matches:
            matches.append([match.start(),match.end(),name])
    return matches

def show_ents(text,entities,title="DRUG"):
    tags = tag_entities(text,entities)
    names = [i[0] for i in tags]
    matches = get_locations(text.lower(),names)
    matches = sorted(matches,key=lambda x: (x[0], x[1]))
    matches = drop_subsets(matches)
    ents = [{"start":x[0],"end":x[1],"label":title} for x in matches]
    ex = [{"text":text,
       "ents":ents,
       "title":None}]
    html = displacy.render(ex,style="ent",manual=True)
    return matches

def drop_subsets(matches):
    reduced = []
    last_end = 0

    for i in range(len(matches)-1):
        # starts at same place
        case1 = matches[i+1][0]!= matches[i][0]
        
        # starts at different place but ends at same place
        case2 = matches[i][1] > matches[i-1][1]

        if case1:
            if i:
                past_last = matches[i][1] > last_end
                if case2 and past_last:
                    reduced.append(matches[i])
                    last_end = matches[i][1]
            else:
                reduced.append(matches[i])
    return reduced

In [15]:
drugs = load_drug_entities()
print(drugs[:10])
text = medication_sections[6]
drug_matches = show_ents(text,drugs)

['acetaminophen', 'darifenacin', 'penicillamine', 'betimol', 'vinblastine sulfate', 'corzide', 'abreva', 'enalaprilat', 'megace', 'femhrt 1/5']


In [16]:
def load_dose_entities():
    doses = np.load("./dose_entities.npy")
    doses = [d.lower() for d in doses if len(d)>1]
    return list(set(doses))

In [17]:
dose_ents = load_dose_entities()
print(dose_ents[:10])

['0.1%;15g tube', '60mg/ml syringe', '16mg tab', '1gm tab ppk', 'iodoquinol-hc 1%', '50mg tab ppk', '900 mg premix bag', '0.5 % opht drps', '5 mg/10 ml brand name', '20 mg tablet']


In [18]:
def load_route_entities():
    routes = np.load("./route_entities.npy")
    routes = [d.lower() for d in routes if d.lower()!='as']
    routes = set(routes)
    return list(routes)

In [19]:
route_ents = load_route_entities()
print(route_ents[:10])

['ij', 'nan', 'po/pr', 'iv bolus', 'irr', 'scpca', 'ivs', 'im', 'pb', 'in']


In [20]:
print(route_ents[:10])

['ij', 'nan', 'po/pr', 'iv bolus', 'irr', 'scpca', 'ivs', 'im', 'pb', 'in']


In [21]:
route_matches = show_ents(text,route_ents,title="ROUTE")

#### Tagging All Entities

In [22]:
def get_ents(text,entities,title="DRUG"):
    tags = tag_entities(text,entities)
    names = [i[0] for i in tags]
    matches = get_locations(text.lower(),names)
    matches = sorted(matches,key=lambda x: (x[0], x[1]))
    matches = drop_subsets(matches)
    return [[x[0],x[1],title] for x in matches]

In [23]:
def get_all_tags(text):
    drug_tags = get_ents(text,drugs,title="DRUG")
    dose_tags = get_ents(text,dose_ents,title="DOSE")
    route_tags = get_ents(text,route_ents,title="ROUTE")
    return sorted(drug_tags+dose_tags+route_tags)

In [24]:
def show_all_ents(text):
    tags = get_all_tags(text)
    ents = [{"start":x[0],"end":x[1],"label":x[2]} for x in tags]
    ex = [{"text":text,
       "ents":ents,
       "title":None},
         ]
    colors = {"DRUG": "rgb(60,180,240)","DOSE":"rgb(240,180,60)","ROUTE":"rgb(200,200,200)"}
    options = {"colors":colors}
    html = displacy.render(ex,style="ent",manual=True,options=options,jupyter=True)
    return

In [25]:
def load_drug_entities():
    drugs = np.load("./drug_entities.npy")
    drugs = [d.lower() for d in drugs if (len(d)>4)]
    non_ents = ["solution"]
    for ent in non_ents:
        drugs.remove(ent)
    drugs = list(set(drugs))
    #drugs = add_single_words(drugs,l=4)
    return drugs

def load_dose_entities():
    doses = np.load("./dose_entities.npy")
    doses = [d.lower() for d in doses if len(d)>1]
    return list(set(doses))

def load_unit_entities():
    ents = np.load("./unit_entities.npy")
    ents = [ent.lower() for ent in ents]
    return list(set(ents))

def load_route_entities():
    routes = np.load("./route_entities.npy")
    routes = [d.lower() for d in routes if d.lower()!='as']
    routes = set(routes)
    return list(routes)

In [41]:
def create_ent_df():
    drugs = pd.DataFrame()
    drugs["Name"] = load_drug_entities()
    drugs["Entity"] = 'DRUG'
    
    doses = pd.DataFrame()
    doses["Name"] = load_dose_entities()
    doses["Entity"] = 'DOSE'
    
    routes = pd.DataFrame()
    routes["Name"] = load_route_entities()
    routes["Entity"] = 'ROUTE'
    
    unit = pd.DataFrame()
    unit["Name"] = load_unit_entities()
    unit["Entity"] = 'UNIT'
    
    other_ents = pd.DataFrame()
    names = [
            "a day",
            "daily",
            "hours",
            "hr",
            "every",
            "as needed",
            "delayed release",
            "extended release",
            "sustained release",
            "refills",
            "disp",
             ]
    numbers = [str(i) for i in range(100)]
    numbers = numbers+["1-2","1-3","1-4","2-4"]
    
    other_ents["Name"] = names+numbers                    
    other_ents["Entity"] = ["Frequency"]*6+["ROUTE"]*2+["DOSE"]*3+["CARDINAL"]*104
    
    df = pd.concat([drugs,unit,doses,routes,other_ents],axis=0)
    
    return df

In [46]:
def in_text(x,text):
    if f"{x}" in text.lower():
        return True
    else:
        return False
    
def add_ent_matches(text,entities):
        matches = []
        for name,ent_type in entities:
            ent_matches = re.finditer(f"[^a-zA-Z0-9\.]{name}[^a-zA-Z0-9\.]" ,text.lower())
            for match in ent_matches:
                matches.append([match.start()+1,match.end()-1,ent_type])
        return matches
    
def add_pattern_matches(text):
    matches = []
    patterns = ["[^a-zA-Z]\d+\.\d+[^a-zA-Z\.]","[^a-zA-Z]\d+\-\d+[^a-zA-Z\.]"]
    for pattern in patterns:
        pat_matches = re.finditer(pattern,text.lower())
        for match in pat_matches:
            matches.append([match.start()+1,match.end()-1,'CARDINAL'])
    ord_matches = re.finditer(r"\n\d+\.",text.lower())
    for match in ord_matches:
        matches.append([match.start()+1,match.end()-1,'ORDINAL'])
    return matches

def get_ent_locs(text,entities):
    ent_matches = add_ent_matches(text,entities)
    pat_matches = add_pattern_matches(text)
    matches = ent_matches+pat_matches
    matches = sorted(matches,key=lambda x: (x[0], x[1]))
    print(matches[0])
    return drop_subsets(matches)

def show_ents(text,entities):
    ents = [{"start":x[0],"end":x[1],"label":x[2]} for x in entities]
    ex = [{"text":text,
       "ents":ents}
         ]
    colors = {"DRUG": "rgb(60,180,240)","DOSE":"rgb(240,180,60)","ROUTE":"rgb(200,200,200)"}
    options = {"colors":colors}
    html = displacy.render(ex,style="ent",manual=True,options=options,jupyter=True)
    return

In [47]:
def get_entity_tags(text,ent_df):
    ent_df["in_text"] = ent_df["Name"].apply(lambda x:in_text(x,text))
    entities = ent_df[ent_df["in_text"]][["Name","Entity"]].values
    ent_locs = get_ent_locs(text,entities)
    return ent_locs

In [48]:
def clean_text(text):
    text = re.sub("[:*]"," ",text)
    return text

In [49]:
text = medication_sections[2]
text = clean_text(text)
ent_df = create_ent_df()
entities = get_entity_tags(text,ent_df)
show_ents(text,entities)

[23, 24, 'ORDINAL']


---