In [2]:
!pip install -U spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy
from spacy.matcher import PhraseMatcher

# Load English model
nlp = spacy.load("en_core_web_sm")


In [4]:
# Disease terms
disease_terms = [
    "Type 2 Diabetes",
    "T2DM",
    "Diabetes Mellitus Type 2",
    "Hypertension",
    "High Blood Pressure",
    "HTN",
    "COVID-19",
    "SARS-CoV-2",
    "Coronavirus Disease",
    "Myocardial Infarction",
    "Heart Attack",
    "MI",
    "Chronic Kidney Disease",
    "CKD",
    "Renal Failure"
]

# Medication terms
medication_terms = [
    "Metformin",
    "Insulin Glargine",
    "Lantus",
    "Lisinopril",
    "ACE Inhibitor",
    "Remdesivir",
    "Paxlovid",
    "COVID-19 Vaccine",
    "Aspirin",
    "Acetylsalicylic Acid"
]


In [5]:
# Initialize PhraseMatcher (case-insensitive)
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Convert terms to spaCy patterns
disease_patterns = [nlp.make_doc(term) for term in disease_terms]
medication_patterns = [nlp.make_doc(term) for term in medication_terms]

# Add patterns to matcher
matcher.add("DISEASE", disease_patterns)
matcher.add("MEDICATION", medication_patterns)


In [6]:
# Sample clinical text
text = "Patient diagnosed with T2DM, prescribed Metformin 500mg twice daily."

# Process text
doc = nlp(text)

# Apply matcher
matches = matcher(doc)

# Display results
for match_id, start, end in matches:
    span = doc[start:end]
    label = nlp.vocab.strings[match_id]
    print(f"{label}: {span.text}")


DISEASE: T2DM
MEDICATION: Metformin


In [7]:
from spacy.pipeline import EntityRuler

ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = [
    {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^MED-\d{4}$"}}]},
    {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^RX-[A-Z]+-\d{2}$"}}]},

    # Disease codes
    {"label": "DISEASE_CODE", "pattern": [{"TEXT": {"REGEX": "^ICD-10:[A-Z]\d{2}.\d{1,2}$"}}]},

    # Specific COVID-19 terms
    {"label": "COVID_TERM", "pattern": "COVID-19 positive"},
    {"label": "COVID_TERM", "pattern": "SARS-CoV-2 infection"},
    {"label": "COVID_TERM", "pattern": [{"LOWER": "covid"}, {"IS_PUNCT": True}, {"LIKE_NUM": True}]},

    # Adverse reactions pattern
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "side"}, {"LOWER": "effect"}]},
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "adverse"}, {"LOWER": "reaction"}]},
    {"label": "ADVERSE_REACTION", "pattern": [{"LOWER": "allergic"}, {"LOWER": "reaction"}]},

    # Dosage patterns
    {"label": "DOSAGE", "pattern": [{"LIKE_NUM": True}, {"LOWER": "mg"}]},
    {"label": "DOSAGE", "pattern": [{"LIKE_NUM": True}, {"LOWER": "ml"}]},
]

ruler.add_patterns(patterns)

# Test on clinical report
clinical_text = """
Patient ID: 12345
Chief Complaint: COVID-19 positive, admitted with severe symptoms
Medical History: T2DM (ICD-10:E11.9), HTN
Current Medications: MED-2045 (Metformin 500mg), RX-COVID-VAC-01
Adverse Reactions: Patient reported side effect of nausea with Metformin
Treatment Plan: Continue 500mg twice daily, monitor blood glucose
"""

doc = nlp(clinical_text)

print("Extracted Medical Entities:")
print("-" * 60)
for ent in doc.ents:
    print(f"{ent.label_:20} | {ent.text}")


Extracted Medical Entities:
------------------------------------------------------------
DATE                 | 12345
COVID_TERM           | COVID-19 positive
MEDICATION_CODE      | MED-2045
DOSAGE               | 500mg
ADVERSE_REACTION     | side effect
PERSON               | Metformin
Treatment Plan: Continue
DOSAGE               | 500mg


  {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^MED-\d{4}$"}}]},
  {"label": "MEDICATION_CODE", "pattern": [{"TEXT": {"REGEX": "^RX-[A-Z]+-\d{2}$"}}]},
  {"label": "DISEASE_CODE", "pattern": [{"TEXT": {"REGEX": "^ICD-10:[A-Z]\d{2}.\d{1,2}$"}}]},


In [8]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Pattern for symptoms
symptom_patterns = [
    [{"LOWER": "fever"}, {"LOWER": {"IN": ["of", ">"]}}, {"LIKE_NUM": True}],
    [{"LOWER": "difficulty"}, {"LOWER": "breathing"}],
    [{"LOWER": "shortness"}, {"LOWER": "of"}, {"LOWER": "breath"}],
    [{"LOWER": "chest"}, {"LOWER": "pain"}],
    [{"LOWER": "persistent"}, {"LOWER": "cough"}],
]

for pattern in symptom_patterns:
    matcher.add("SYMPTOM", [pattern])

# Process
text = "Patient presents with fever of 102.5°F, difficulty breathing, and persistent cough."
doc = nlp(text)
matches = matcher(doc)

print("\nSymptoms Detected:")
for match_id, start, end in matches:
    print(f"- {doc[start:end].text}")


Symptoms Detected:
- fever of 102.5
- difficulty breathing
- persistent cough
