#Domain Specific Named-Entity Recognizer using Regular Expressions

###Techniques utilized for recognizing domain specific patterns based on Named Entities are,
   1. Named Entity Recognition (NER)
   2. Regular Expressions

In [5]:
import re
import pandas as pd
from datasets import load_dataset

Loading the FiNER-ORD dataset directly from Hugging Face Datasets Hub

In [38]:
ds = load_dataset("wikiann", "en")
print(ds["train"][0]['tokens'])

README.md: 0.00B [00:00, ?B/s]

en/validation-00000-of-00001.parquet:   0%|          | 0.00/748k [00:00<?, ?B/s]

en/test-00000-of-00001.parquet:   0%|          | 0.00/748k [00:00<?, ?B/s]

en/train-00000-of-00001.parquet:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

['R.H.', 'Saunders', '(', 'St.', 'Lawrence', 'River', ')', '(', '968', 'MW', ')']


In [4]:
!pip install datasets pandas --quiet

In [44]:
# Load word-tokenized WikiAnn NER dataset (English, but you can change 'en' to other languages)
ds = load_dataset("wikiann", "en")

In [45]:
label_map = ds["train"].features["ner_tags"].feature.names

regex_patterns = {
    "PER": [
        r"\b(Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b",
        r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b"  # Capitalized full names
    ],
    "LOC": [
        r"\b(New York|London|Tokyo|Paris|Berlin|Sydney|Beijing|Toronto|Delhi)\b"
    ],
    "ORG": [
        r"\b[A-Z][a-zA-Z0-9&\-. ]+(Inc\.|Corp\.|LLC|Ltd\.|PLC|Company|Corporation|Bank|Group)\b",
        r"\b(United Nations|UNICEF|Google|Microsoft|Apple|NASA|WHO|IMF|World Bank)\b"
    ]
}

In [46]:
def get_entities_from_bio(tokens, tags):
    entities = []
    entity = []
    entity_label = None
    for tok, tag_id in zip(tokens, tags):
        label = label_map[tag_id]
        if label.endswith("-B"):
            if entity:
                entities.append({"text": ' '.join(entity), "label": entity_label})
                entity = []
            entity = [tok]
            entity_label = label.split("-")[1]
        elif label.endswith("-I") and entity:
            entity.append(tok)
        else:
            if entity:
                entities.append({"text": ' '.join(entity), "label": entity_label})
                entity = []
                entity_label = None
    if entity:
        entities.append({"text": ' '.join(entity), "label": entity_label})
    return entities

In [47]:
def regex_ner(text):
    matches = []
    for label, patterns in regex_patterns.items():
        for pat in patterns:
            for m in re.finditer(pat, text):
                matches.append({"text": m.group(), "label": label})
    return matches

In [48]:
# Process and print 5 examples
num_samples = 5
for samp in ds["train"].select(range(num_samples)):
    tokens = samp["tokens"]
    tags = samp["ner_tags"]
    text = " ".join(tokens)
    gold_entities = get_entities_from_bio(tokens, tags)
    pred_entities = regex_ner(text)
    print("\n--- Example ---")
    print("TEXT:", text[:220], "...")
    print("GOLD:", [(e['text'], e['label']) for e in gold_entities])
    print("REGEX:", [(e['text'], e['label']) for e in pred_entities])


--- Example ---
TEXT: R.H. Saunders ( St. Lawrence River ) ( 968 MW ) ...
GOLD: []
REGEX: [('Lawrence River', 'PER')]

--- Example ---
TEXT: ; ' '' Anders Lindström '' ' ...
GOLD: []
REGEX: []

--- Example ---
TEXT: Karl Ove Knausgård ( born 1968 ) ...
GOLD: []
REGEX: [('Karl Ove', 'PER')]

--- Example ---
TEXT: Atlantic City , New Jersey ...
GOLD: []
REGEX: [('Atlantic City', 'PER'), ('New Jersey', 'PER')]

--- Example ---
TEXT: Her daughter from the second marriage was Marie d'Agoult ( 1805–1876 ) , who in turn gave birth to several children , among them—from her liaison to Franz Liszt –- Cosima Wagner ( 1837–1930 ) . ...
GOLD: []
REGEX: [('Franz Liszt', 'PER'), ('Cosima Wagner', 'PER')]


In [49]:
df_results = pd.DataFrame([{
    "text": " ".join(samp["tokens"]),
    "gold_entities": get_entities_from_bio(samp["tokens"], samp["ner_tags"]),
    "pred_entities": regex_ner(" ".join(samp["tokens"]))
} for samp in ds["train"].select(range(num_samples))])
df_results.to_csv("wikiann_ner_regex_sample.csv", index=False)
print("\nResults saved to wikiann_ner_regex_sample.csv!")


Results saved to wikiann_ner_regex_sample.csv!
