In [1]:
# !pip install spacy

In [2]:
import spacy
from spacy.matcher import PhraseMatcher

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define medical term dictionaries
disease_terms = [
    "Type 2 Diabetes", "T2DM", "Diabetes Mellitus Type 2",
    "Hypertension", "High Blood Pressure", "HTN",
    "COVID-19", "SARS-CoV-2", "Coronavirus Disease",
    "Myocardial Infarction", "Heart Attack", "MI",
    "Chronic Kidney Disease", "CKD", "Renal Failure"
]

medication_terms = [
    "Metformin", "Insulin Glargine", "Lantus",
    "Lisinopril", "ACE Inhibitor",
    "Remdesivir", "Paxlovid", "COVID-19 Vaccine",
    "Aspirin", "Acetylsalicylic Acid"
]

# Create PhraseMatcher
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns_disease = [nlp.make_doc(text) for text in disease_terms]
patterns_meds = [nlp.make_doc(text) for text in medication_terms]

matcher.add("DISEASE", patterns_disease)
matcher.add("MEDICATION", patterns_meds)

# Process document
doc = nlp("Patient diagnosed with T2DM, prescribed Metformin 500mg twice daily.")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    label = nlp.vocab.strings[match_id]
    print(f"{label}: {span.text}")


DISEASE: T2DM
MEDICATION: Metformin


In [3]:
from spacy.pipeline import EntityRuler

# Add EntityRuler to pipeline
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Define custom patterns
patterns = [
    # Medication codes
    {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^MED-\d{4}$"}}]},
    {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^RX-[A-Z]+-\d{2}$"}}]},

    # Disease codes
    {"label": "DISEASE_CODE", "pattern": [{"TEXT": {"REGEX": "^ICD-10:[A-Z]\d{2}.\d{1,2}$"}}]},

    # Specific COVID-19 terms
    {"label": "COVID_TERM", "pattern": "COVID-19 positive"},
    {"label": "COVID_TERM", "pattern": "SARS-CoV-2 infection"},
    {"label": "COVID_TERM", "pattern": [{"LOWER": "covid"}, {"IS_PUNCT": True}, {"LIKE_NUM": True}]},

    # Adverse reactions pattern
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "side"}, {"LOWER": "effect"}]},
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "adverse"}, {"LOWER": "reaction"}]},
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "allergic"}, {"LOWER": "reaction"}]},

    # Dosage patterns
    {"label": "DOSAGE", "pattern": [{"LIKE_NUM": True}, {"LOWER": "mg"}]},
    {"label": "DOSAGE", "pattern": [{"LIKE_NUM": True}, {"LOWER": "ml"}]},
]

ruler.add_patterns(patterns)

# Test on clinical report
clinical_text = """
Patient ID: 12345
Chief Complaint: COVID-19 positive, admitted with severe symptoms
Medical History: T2DM (ICD-10:E11.9), HTN
Current Medications: MED-2045 (Metformin 500mg), RX-COVID-VAC-01
Adverse Reactions: Patient reported side effect of nausea with Metformin
Treatment Plan: Continue 500mg twice daily, monitor blood glucose
"""

doc = nlp(clinical_text)

print("Extracted Medical Entities:")
print("-" * 60)
for ent in doc.ents:
    print(f"{ent.label_:20} | {ent.text}")


Extracted Medical Entities:
------------------------------------------------------------
DATE                 | 12345
COVID_TERM           | COVID-19 positive
MEDICATION_CODE      | MED-2045
DOSAGE               | 500mg
ADVERSE_REACTION     | side effect
PERSON               | Metformin
Treatment Plan: Continue
DOSAGE               | 500mg


  {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^MED-\d{4}$"}}]},
  {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^RX-[A-Z]+-\d{2}$"}}]},
  {"label": "DISEASE_CODE", "pattern": [{"TEXT": {"REGEX": "^ICD-10:[A-Z]\d{2}.\d{1,2}$"}}]},


In [4]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Pattern for symptoms
symptom_patterns = [
    [{"LOWER": "fever"}, {"LOWER": {"IN": ["of", ">"]}}, {"LIKE_NUM": True}],
    [{"LOWER": "difficulty"}, {"LOWER": "breathing"}],
    [{"LOWER": "shortness"}, {"LOWER": "of"}, {"LOWER": "breath"}],
    [{"LOWER": "chest"}, {"LOWER": "pain"}],
    [{"LOWER": "persistent"}, {"LOWER": "cough"}],
]

for pattern in symptom_patterns:
    matcher.add("SYMPTOM", [pattern])

# Process
text = "Patient presents with fever of 102.5Â°F, difficulty breathing, and persistent cough."
doc = nlp(text)
matches = matcher(doc)

print("\nSymptoms Detected:")
for match_id, start, end in matches:
    print(f"- {doc[start:end].text}")



Symptoms Detected:
- fever of 102.5
- difficulty breathing
- persistent cough
