In [1]:
!pip install --upgrade nbformat ipywidgets


Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting comm>=0.1.3 (from ipywidgets)
  Downloading comm-0.2.3-py3-none-any.whl.metadata (3.7 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jedi>=0.16 (from ipython>=6.1.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipywidgets-8.1.8-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading comm-0.2.3-py3-none-any.whl (7.3 kB)
Downloading widgetsnbextension-4.0.15-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.7 MB

# **scispacy:**

In [None]:
import spacy

# Load NER-specific model
nlp = spacy.load("en_ner_bc5cdr_md")

text = "Patient was prescribed aspirin for hypertension"
doc = nlp(text)

print("Medical Entities found:")
for ent in doc.ents:
    print(ent.text, ent.label_)


Medical Entities found:
aspirin CHEMICAL
hypertension DISEASE


# **ClinicalBERT:**

In [None]:
# Install transformers and torch if not already installed
# !pip install transformers torch

from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load ClinicalBERT model (fine-tuned on clinical notes)
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Medical text
text = "Patient shows symptoms of type 2 diabetes and hypertension"

# Run NER
entities = ner_pipeline(text)

# Print entities
print("ClinicalBERT Medical Entities found:")
for ent in entities:
    print(ent['word'], ent['entity_group'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ClinicalBERT Medical Entities found:
patient shows LABEL_0
symptoms LABEL_1
of type LABEL_0
2 LABEL_1
diabetes LABEL_0
and LABEL_1
hypertens LABEL_0
##ion LABEL_1


# **Regex Patterns for Extracting Numbers + Units**

# **1️.Basic Number Extraction**

In [None]:
import re

text = "Patient was given 500 mg paracetamol twice daily for 5 days"
numbers = re.findall(r'\d+\.?\d*', text)
print(numbers)


['500', '5']


## **2️.Number + Unit Extraction**

In [None]:
pattern = r'\b\d+\.?\d*\s?(mg|ml|g|kg|units|mcg|%)\b'
matches = re.findall(pattern, text, re.IGNORECASE)
print(matches)


['mg']


# **3️.Full Dosage Extraction**

In [None]:
pattern = r'\b(\d+\.?\d*)\s?(mg|ml|units|mcg)\b'
matches = re.findall(pattern, text)
print(matches)


[('500', 'mg')]


# **4️.Frequency Extraction**

In [None]:
freq_pattern = r'\b(once|twice|thrice|\d+\s?times)\s?(daily|a day)\b'
re.findall(freq_pattern, text, re.IGNORECASE)


[('twice', 'daily')]

In [None]:
text = "Insulin 2 units subcutaneously twice daily"

dose = re.findall(r'\b\d+\s?(units|mg|ml)\b', text)
freq = re.findall(r'\b(once|twice)\s?daily\b', text)

print(dose, freq)


['units'] ['twice']


In [None]:
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=877de53daafbe7a6da281c34b7302fd9aa0d7e8789929a2bad64272883182eb5
  Stored in directory: /root/.cache/pip/wheels/5b/79/fb/d25928e599c7e11fe4e00d32048cd74933f34a74c633d2aea6
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


# **Normalization Before Evaluation**

# **1.Convert Number Words to Digits**

In [None]:
from word2number import w2n

text = "two milligrams"
number = w2n.word_to_num("two")
print(number)


2


# **2.Normalize Units Using Mapping**

In [None]:
unit_map = {
    "milligrams": "mg",
    "milligram": "mg",
    "micrograms": "mcg",
    "grams": "g"
}

text = "two milligrams"
for k, v in unit_map.items():
    text = text.replace(k, v)

print(text)


two mg


# **3.Combine Number + Unit**

In [None]:
normalized = f"{number} mg"
print(normalized)


2 mg


In [None]:
import re

text = "The patient will return in 2 weeks"

pattern = r"\b\d+\s+(day|days|week|weeks|month|months)\b"
matches = re.findall(pattern, text)

print(matches)


['weeks']
