This notebook documents the alternative NLP techniques that were tested and considered. The code for the final NLP pipeline is in FinalNLP.

# Testing NER and word embeddings

In [None]:
# Testing NER as a means to detect if the symptom is present in the record

import spacy

# Load spaCy's English language model. The pipeline has to be assigned here:
# with both loads commented out, `nlp` is undefined on a fresh kernel and the
# call below fails with a NameError (or silently reuses another cell's pipeline).
nlp = spacy.load("en_core_web_sm")
#nlp = spacy.load("en_core_web_md")  # larger general-purpose alternative

# These general-purpose models did not cater for medical entities: their NER
# label scheme has no "MEDICAL_CONDITION" label, so the filter below is
# expected to come back empty and the check reports asthma as absent even
# though the record mentions it.

# Clinical record text
clinical_record = "6. Type 2 diabetes mellitus. 7. Aortic stenosis. 8. Mild asthma. 9. History of atrial fibrillation. 10. History of pneumonectomy in 1999 for tuberculosis. 11. History of AV block. 12. Gastroesophageal reflux. 13. Glaucoma. 14. Hypothyroidism. 15. Osteoarthritis. 16. Osteopenia. 17. Mild pulmonary hypertension. 18. History of squamous-cell carcinoma of the left anterior thigh status post excision in 2006."

# Process the text with spaCy
doc = nlp(clinical_record)

# Extract entities (medical conditions), lower-cased for comparison
medical_conditions = [ent.text.lower() for ent in doc.ents if ent.label_ == "MEDICAL_CONDITION"]

# Check if "asthma" is mentioned (exact match against the extracted entity texts)
if "asthma" in medical_conditions:
    print("The patient has asthma.")
else:
    print("The patient does not have asthma.")

# Print the detected medical conditions
print("Detected Medical Conditions:")
for condition in medical_conditions:
    print("-", condition)

The cell above uses spaCy's Named Entity Recognition (NER) to identify and extract the medical conditions mentioned in the clinical record.

In [None]:
# Testing Med7 for entity recognition

import spacy


med7 = spacy.load("en_core_med7_lg")

# Give every NER label exposed by the pipeline its own highlight colour
seven_colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4']
ner_labels = med7.pipe_labels['ner']
col_dict = dict(zip(ner_labels, seven_colours))

# displaCy options: which entity labels to show and how to colour them
options = {'ents': ner_labels, 'colors': col_dict}

# Sample texts; only the uncommented one is processed
text = 'A patient was prescribed Magnesium hydroxide 400mg/5ml suspension PO of total 30ml bid for the next 5 days.'
#text = '3. History of social phobia. 4. History of panic disorder. 5. History of polysubstance abuse. 6. Asthma. ADMISSION MEDICATIONS: The patient was not taking any prescription OTC or herbal supplements prior to admission. ALLERGIES: No known drug allergies. SOCIAL HISTORY: The patient smokes approximately 1/2 pack of cigarettes per day.'
#text = 'Prior to her surgery, she did not have any history of cough, wheezing, orthopnea. PAST MEDICAL HISTORY: 1. Coronary artery bypass graft X 2, LIMA, AVR with bioprosthesis dated _%#MMDD2003#%_. 2. Atrial fibrillation. a. Status post pacemaker placement. b. History of Amiodarone. 3. Hypercholesterolemia. 4. No previous history of pneumonia, asthma, bronchitis or TB.'
doc = med7(text)

# Render the entities inline in the notebook
spacy.displacy.render(doc, style='ent', jupyter=True, options=options)

# Cell output: each entity's text paired with its label
[(ent.text, ent.label_) for ent in doc.ents]

In [2]:
# Testing medspacy for NER

import medspacy

from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent, visualize_dep

nlp = medspacy.load()
print(nlp.pipe_names)

# Register the literal terms the target matcher should tag as CONDITION
condition_terms = ('stroke', 'diabetes', 'pna')
target_rules = [TargetRule(term, 'CONDITION') for term in condition_terms]
nlp.get_pipe('medspacy_target_matcher').add(target_rules)

# Sample texts; only the uncommented one is processed
#doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')
doc = nlp('6. Type 2 diabetes mellitus. 7. Aortic stenosis. 8. Mild asthma. 9. History of atrial fibrillation. 10. History of pneumonectomy in 1999 for tuberculosis. 11. History of AV block. 12. Gastroesophageal reflux. 13. Glaucoma. 14. Hypothyroidism. 15. Osteoarthritis. 16. Osteopenia. 17. Mild pulmonary hypertension. 18. History of squamous-cell carcinoma of the left anterior thigh status post excision in 2006.')
#doc = nlp('4. Cystic fibrosis exacerbation: She has been on chronic antibiotics because of her severe lung disease. Will continue meropenem and tobramycin along with Zithromax and doxycycline. She will remain on Bactrim prophylaxis for possible PCP as the patient is on prednisone. In addition will continue with Vest therapy and nebs. 5. History of ABPA, on prednisone. Most likely the cause for her significant asthmatic component.')

# For each matched entity, print it with its negated / family / historical flags
for ent in doc.ents:
    print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)
visualize_ent(doc)

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']
diabetes False False False


In [None]:
# Testing scispacy

import scispacy
import spacy
from spacy import displacy

import en_core_sci_sm

text = "Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity. They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC)."
nlp = spacy.load("en_core_sci_sm")
doc = nlp(text)

# Sentence segmentation produced by the pipeline
sentences = list(doc.sents)
print(sentences)

# Entities found by the mention detector. Unlike stock spaCy they carry no
# types and are more general (e.g. they can include verbs): any span that
# might be an entity in UMLS, a large biomedical database.
print(doc.ents)


# Dependency parse of the first sentence
# (renders automatically inside a Jupyter notebook)
displacy.render(next(doc.sents), style='dep', jupyter=True)

# See below for the generated SVG.
# Zoom your browser in a bit!

# Testing POS Tagging

In [8]:
# Testing part-of-speech (POS) tagging


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

# Fetch the tokenizer and tagger resources used below
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)


# Sample texts; only the uncommented one is tagged
text = "The patient confirms the presence of pain. No other symptoms are reported."
#text = "The patient was diagnosed with depression"

# Split into word tokens, then tag each token with its part of speech
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Cell output: list of (token, tag) pairs
pos_tags

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tanobugelli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tanobugelli/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('The', 'DT'),
 ('patient', 'NN'),
 ('confirms', 'VBZ'),
 ('the', 'DT'),
 ('presence', 'NN'),
 ('of', 'IN'),
 ('pain', 'NN'),
 ('.', '.'),
 ('No', 'DT'),
 ('other', 'JJ'),
 ('symptoms', 'NNS'),
 ('are', 'VBP'),
 ('reported', 'VBN'),
 ('.', '.')]

In [4]:
# POS tags treated as markers of affirmative and negative contexts.
# NOTE(review): 'VB', 'VBD' and 'VBG' appear in both lists, so only 'VBP'
# (affirmative) and 'VBN' (negative) actually separate the two groups.
affirmative_pos_patterns = ['VB', 'VBD', 'VBG', 'VBP']  # Verbs indicating affirmation
negative_pos_patterns = ['VB', 'VBD', 'VBG', 'VBN']  # Verbs indicating negation

# Walk the (word, tag) pairs once, collecting each word under every group
# whose tag list contains its tag. Relies on `pos_tags` from the tagging cell.
affirmative_keywords = []
negative_keywords = []
for word, pos in pos_tags:
    if pos in affirmative_pos_patterns:
        affirmative_keywords.append(word)
    if pos in negative_pos_patterns:
        negative_keywords.append(word)

print("Affirmative Keywords:", affirmative_keywords)
print("Negative Keywords:", negative_keywords)




Affirmative Keywords: ['are']
Negative Keywords: ['reported']


# Testing Keyword Matching

In [7]:


# Keyword matching: collect the words that share an n-gram window with a keyword



from nltk import word_tokenize, ngrams

# Keywords of interest
keywords = ["pneumonia", "depression", "fibrosis"]

# Example text, split into a list of word tokens
text = "positive for neuropsych with depression"
tokens = word_tokenize(text)

# Function to extract n-grams
def extract_ngrams(tokens, keywords, n):
    """Return the words that immediately precede each keyword occurrence.

    For every token that equals one of ``keywords``, the context is taken from
    the n-gram that ends at that token: the keyword itself plus up to
    ``n - 1`` tokens before it. Fewer tokens are used when the keyword sits
    near the start of the text, so short texts still yield a context instead
    of being dropped. Words that follow the keyword are never included.

    Args:
        tokens: Iterable of word tokens, in text order.
        keywords: Iterable of keyword strings, compared to tokens exactly
            (case-sensitive).
        n: Window size in tokens, counting the keyword itself.

    Returns:
        A list with one space-joined context string per keyword occurrence,
        in text order. Any keyword found inside the window is left out of the
        context. The string is empty when nothing precedes the keyword.
    """
    tokens = list(tokens)
    keyword_set = set(keywords)
    # The keyword occupies the last slot of the n-gram, leaving n - 1 slots before it
    preceding_count = max(n - 1, 0)
    keyword_contexts = []

    for index, token in enumerate(tokens):
        if token not in keyword_set:
            continue
        # Clamp at 0 so a keyword near the start still gets the words available
        window_start = max(0, index - preceding_count)
        context_words = [word for word in tokens[window_start:index] if word not in keyword_set]
        keyword_contexts.append(" ".join(context_words))

    return keyword_contexts

# Window size in tokens (the keyword itself counts towards it)
ngram_size = 5

# Look up the context for one keyword at a time; keywords with no context are not reported
for keyword in keywords:
    contexts = extract_ngrams(tokens, [keyword], ngram_size)
    if not contexts:
        continue
    print(f"Context for '{keyword}': {contexts}")

Context for 'depression': ['positive for neuropsych with']


# Testing Regular Expressions

In [6]:
import re

# Example text
#text = "positive for neuropsych and monkeys with depression"
#text = "worked up for bananas and depression"
#text = "diagnosed with a bad depression"
text = "patient had no acute depression"


# Case-insensitive patterns. The gap between a trigger phrase and "depression"
# is written as [^.\n]*? instead of .*? so that a match can never run across a
# full stop into the next sentence (line breaks stay excluded, as with ".").
# Trade-off: a dot inside the gap (abbreviation, decimal, list number) also
# stops the match.

#Affirmative Case (Patient Has)
affirmative_patterns = [
    re.compile(r"positive\s+for\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"worked\s+up\s+for\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"diagnosed\s+with\s+depression", re.IGNORECASE),
    re.compile(r"diagnosed[^.\n]*?with[^.\n]*?depression", re.IGNORECASE),
]
# Past Case (Patient Had)
past_patterns = [
    re.compile(r"history\s+of\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"past\s+medical\s+history\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"past\s+history\s+[^.\n]*?depression", re.IGNORECASE),
    # Was "of\.*?depression", which only allowed literal dots after "of" and
    # therefore never matched "with a history of depression".
    re.compile(r"with\s+a\s+history\s+of\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"had previous\s+[^.\n]*?diagnosis\s+of\s+[^.\n]*?depression", re.IGNORECASE),
    re.compile(r"childhood\s+depression", re.IGNORECASE),
]

#Negative Case (Patient Does Not Have)
# \b keeps "no" from matching the tail of a longer word (e.g. "Reno with depression").
negative_patterns = [
    re.compile(r"\bno\s+depression", re.IGNORECASE),
    re.compile(r"\bno\s+[^.\n]*?depression", re.IGNORECASE),
]


def classify_context(text):
    """Return the message describing the context in which depression is mentioned.

    The pattern groups are tried in a fixed order (affirmative, then past,
    then negative) and the first group with a matching pattern decides the
    result.

    Args:
        text: The clinical text to inspect.

    Returns:
        One of "Affirmative context found.", "Past context found.",
        "Negative context found." or "No context found.".
    """
    if any(pattern.search(text) for pattern in affirmative_patterns):
        return "Affirmative context found."
    if any(pattern.search(text) for pattern in past_patterns):
        return "Past context found."
    if any(pattern.search(text) for pattern in negative_patterns):
        return "Negative context found."
    return "No context found."


# Report the context detected in the example text
print(classify_context(text))



Negative context found.
