In [6]:
import pandas as pd 
import numpy as np 
import spacy
from spacy import displacy


In [3]:
import en_core_med7_lg

In [5]:
nlp = spacy.load("en_core_med7_lg")
nlp

<spacy.lang.en.English at 0x7f3740399990>

In [9]:
col_dict = {}
s_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']

for label, colour in zip(nlp.pipe_labels['ner'], s_colours):
    col_dict[label] = colour

In [10]:
col_dict

{'DOSAGE': '#e6194B',
 'DRUG': '#3cb44b',
 'DURATION': '#ffe119',
 'FORM': '#ffd8b1',
 'FREQUENCY': '#f58231',
 'ROUTE': '#f032e6',
 'STRENGTH': '#42d4f4'}

In [14]:
nlp.pipe_labels["ner"]

['DOSAGE', 'DRUG', 'DURATION', 'FORM', 'FREQUENCY', 'ROUTE', 'STRENGTH']

In [12]:
nlp.pipe_names

['tok2vec', 'ner']

In [15]:
med = pd.read_csv("/home/jerlshin/Documents/My_Work/GenAI_Hackathon_16April2024/Dummy_Medical_report/mtsamples.csv",
                  index_col=0)        

In [16]:
med.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [17]:
import re

med['transcription'] = med['transcription'].astype('str')
med['transcription'] = med['transcription'].apply(lambda x: re.sub('(\.,)', ". ", x))

In [19]:
med["transcription"]

0       SUBJECTIVE:,  This 23-year-old white female pr...
1       PAST MEDICAL HISTORY:, He has difficulty climb...
2       HISTORY OF PRESENT ILLNESS: , I have seen ABC ...
3       2-D M-MODE: , ,1.  Left atrial enlargement wit...
4       1.  The left ventricular cavity size and wall ...
                              ...                        
4994    HISTORY:,  I had the pleasure of meeting and e...
4995    ADMITTING DIAGNOSIS: , Kawasaki disease. DISCH...
4996    SUBJECTIVE: , This is a 42-year-old white fema...
4997    CHIEF COMPLAINT: , This 5-year-old male presen...
4998    HISTORY: , A 34-year-old male presents today s...
Name: transcription, Length: 4999, dtype: object

In [36]:
med["transcription"][0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: , She has no known medicine allergies. OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78. HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear. Neck:  Supple without adenopathy. 

In [37]:
medical_doc = med['transcription'].tolist()[0]

medical_doc

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: , She has no known medicine allergies. OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78. HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear. Neck:  Supple without adenopathy. 

In [38]:
nlp = spacy.load("en_core_med7_lg")

def generate_annotation(texts):
    annotations = []
    for text in texts:
        doc = nlp(text)
        entities = []
        for ent in doc.ents:
            entities.append((ent.start_char, ent.end_char, ent.label_))
        annotations.append((text, {'entities': entities}))
    return annotations

# sampled for only the 1st document 
medical_doc = med['transcription'].tolist()[0]


"""This is the document from the user: we will take only the important text and this model will do the rest
can also be used for the chat application
"""
annotations = generate_annotation(medical_doc)

print("Document:")
print(annotations[0][0]) # first document text
print("Annotations:")
print(annotations[0][1]) 

Document:
S
Annotations:
{'entities': []}


In [39]:
annotations

[('S', {'entities': []}),
 ('U', {'entities': []}),
 ('B', {'entities': []}),
 ('J', {'entities': []}),
 ('E', {'entities': []}),
 ('C', {'entities': []}),
 ('T', {'entities': []}),
 ('I', {'entities': []}),
 ('V', {'entities': [(0, 1, 'ROUTE')]}),
 ('E', {'entities': []}),
 (':', {'entities': []}),
 (',', {'entities': []}),
 (' ', {'entities': []}),
 (' ', {'entities': []}),
 ('T', {'entities': []}),
 ('h', {'entities': []}),
 ('i', {'entities': []}),
 ('s', {'entities': []}),
 (' ', {'entities': []}),
 ('2', {'entities': [(0, 1, 'DOSAGE')]}),
 ('3', {'entities': [(0, 1, 'DOSAGE')]}),
 ('-', {'entities': []}),
 ('y', {'entities': []}),
 ('e', {'entities': []}),
 ('a', {'entities': []}),
 ('r', {'entities': []}),
 ('-', {'entities': []}),
 ('o', {'entities': []}),
 ('l', {'entities': []}),
 ('d', {'entities': []}),
 (' ', {'entities': []}),
 ('w', {'entities': []}),
 ('h', {'entities': []}),
 ('i', {'entities': []}),
 ('t', {'entities': []}),
 ('e', {'entities': []}),
 (' ', {'entities

In [40]:
options = {'ents': nlp.pipe_labels['ner'], 'colors':col_dict}

transcription = med['transcription'][0]
doc = nlp(transcription)

spacy.displacy.render(doc, style = 'ent', jupyter = True, options = options)

[(ent.text, ent.label_) for ent in doc.ents]

[('Claritin', 'DRUG'),
 ('Zyrtec', 'DRUG'),
 ('daily', 'FREQUENCY'),
 ('Zyrtec', 'DRUG'),
 ('loratadine', 'DRUG'),
 ('Nasonex', 'DRUG'),
 ('two', 'DOSAGE'),
 ('sprays', 'FORM'),
 ('each', 'DOSAGE'),
 ('for three weeks', 'DURATION')]