# Named Entity Recognition (NER) Demo

This notebook demonstrates the Named Entity Recognition capabilities of the NLP toolkit, including:
- Loading and preprocessing data for NER
- Using pre-trained NER models
- Fine-tuning on custom datasets
- Evaluating NER performance
- Visualizing entity predictions
- Applying NER to real-world text

In [None]:
# Make the project's `src` package importable from this notebook
import os
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory,
# so the kernel is expected to start in a folder directly below the project root.
notebook_dir = Path().resolve()
project_root = notebook_dir.parent
# Placed at the front of sys.path so it is searched before any other entry
sys.path.insert(0, str(project_root))

# Import toolkit modules
from src.data.preprocessing import TextPreprocessor
from src.data.data_loader import get_ner_loader
from src.models.named_entity_recognition import NERModel
from src.training.metrics import token_classification_metrics
from src.utils.visualization import plot_token_classification_results

# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
import torch

## 1. Configuration and Setup

In [None]:
# Configuration
# --- Task, model and dataset identifiers ---
TASK = "ner"
MODEL_NAME = "dslim/bert-base-NER"  # Pre-trained NER model
DATASET_NAME = "conll2003"  # Standard NER benchmark dataset

# --- Numeric settings ---
MAX_LENGTH = 128
BATCH_SIZE = 16
NUM_EPOCHS = 1  # Using just 1 epoch for demonstration purposes

# --- Output directory for model and results ---
# Kept as a plain string; the directory (and any missing parents) is created
# on first run and left untouched when it already exists.
OUTPUT_DIR = os.path.join(project_root, "models", "demo_ner")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

## 2. Exploring NER with Pre-trained Models

In [None]:
# Initialize a pre-trained NER model
ner_model = NERModel(model_name=MODEL_NAME)

# Print model information
print(f"Model: {MODEL_NAME}")
print(f"Label map: {ner_model.id2label}")

# Collapse prefixed labels such as "B-PER" / "I-PER" to their entity type.
# split('-', 1)[-1] keeps everything after the first hyphen and returns the
# label unchanged when there is no hyphen, so labels without a prefix are
# counted as-is instead of raising IndexError.
entity_types = {
    label.split('-', 1)[-1]
    for label in ner_model.id2label.values()
    if label != 'O'
}
print(f"Number of entity types: {len(entity_types)}")

In [None]:
# Sample texts for NER prediction
sample_texts = [
    "Apple Inc. is planning to open a new store in New York City next month.",
    "The European Union and United States signed a new trade agreement yesterday in Brussels.",
    "Albert Einstein developed the theory of relativity while working at the Swiss Patent Office in Bern.",
    "Microsoft CEO Satya Nadella announced a partnership with OpenAI to develop new AI technologies."
]

# Perform NER on sample texts
predictions = ner_model.predict(sample_texts)

# Display results: one numbered header per text, then one line per entity
for number, (sentence, found) in enumerate(zip(sample_texts, predictions), start=1):
    print(f"\nText {number}: {sentence}")
    print("Entities:")
    for item in found:
        # 'start'/'end' are used as character offsets into the sentence
        span_text = sentence[item['start']:item['end']]
        label, confidence = item['entity'], item['score']
        print(f"  {span_text} ({label}): {confidence:.3f}")

# Note: Prediction-Only Version

This is a simplified version of the NER demo notebook that focuses on prediction rather than training. It demonstrates how to use pre-trained NER models for inference, but skips the training sections, which require specific dataset formatting and tensor shapes.

## Using Pre-trained NER Models

In [None]:
# Initialize a pre-trained NER model
# NOTE(review): an earlier cell already builds `ner_model` from the same
# MODEL_NAME, so running this cell constructs the model a second time.
ner_model = NERModel(model_name=MODEL_NAME)

# Print model information
print(f"Model: {MODEL_NAME}")
print(f"Label map: {ner_model.id2label}")

# Collapse prefixed labels such as "B-PER" / "I-PER" to their entity type.
# split('-', 1)[-1] keeps everything after the first hyphen and returns the
# label unchanged when there is no hyphen, so labels without a prefix are
# counted as-is instead of raising IndexError.
entity_types = {
    label.split('-', 1)[-1]
    for label in ner_model.id2label.values()
    if label != 'O'
}
print(f"Number of entity types: {len(entity_types)}")

In [None]:
# Sample texts for NER prediction
sample_texts = [
    "Apple Inc. is planning to open a new store in New York City next month.",
    "The European Union and United States signed a new trade agreement yesterday in Brussels.",
    "Albert Einstein developed the theory of relativity while working at the Swiss Patent Office in Bern.",
    "Microsoft CEO Satya Nadella announced a partnership with OpenAI to develop new AI technologies."
]

# Perform NER on sample texts
predictions = ner_model.predict(sample_texts)

# Display results: one numbered header per text, then one line per entity
for number, (sentence, found) in enumerate(zip(sample_texts, predictions), start=1):
    print(f"\nText {number}: {sentence}")
    print("Entities:")
    for item in found:
        # 'start'/'end' are used as character offsets into the sentence
        span_text = sentence[item['start']:item['end']]
        label, confidence = item['entity'], item['score']
        print(f"  {span_text} ({label}): {confidence:.3f}")

In [None]:
# Visualize NER predictions for the first text
def visualize_ner_prediction(text, entities):
    """Plot word-level BIO tags derived from character-offset NER predictions.

    The text is split on whitespace and every word is aligned to the
    predictions by character position: a word receives an entity's type when
    the word's ``[begin, end)`` span overlaps the entity's ``[start, end)``
    span. Aligning on offsets (rather than on substring containment) keeps
    short words such as "a" from being tagged just because the letter occurs
    inside an entity like "Satya".

    Args:
        text: The string the predictions refer to.
        entities: Iterable of prediction dicts. Each one is read through the
            keys ``'start'`` and ``'end'`` (character offsets into ``text``)
            and ``'entity'`` (a label, either prefixed like ``'B-PER'`` or
            bare like ``'PER'``). Entries whose offsets are ``None`` cannot be
            aligned and are skipped.

    Returns:
        None. The words and their tags are passed to
        ``plot_token_classification_results`` with ``true_labels=None``.
    """
    tokens = text.split()

    # Character span [begin, end) of each whitespace-separated word. Searching
    # from the end of the previous word keeps repeated words apart.
    token_spans = []
    cursor = 0
    for token in tokens:
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        token_spans.append((begin, cursor))

    token_types = [None] * len(tokens)   # bare entity type per word, None = untagged
    token_entities = ['O'] * len(tokens)

    # Process entities left to right so the first piece that touches a word
    # decides that word's tag; later pieces of the same word are ignored.
    located = [e for e in entities if e['start'] is not None and e['end'] is not None]
    for entity in sorted(located, key=lambda e: e['start']):
        label = entity['entity']
        prefix, dash, remainder = label.partition('-')
        entity_type = remainder if dash else label
        labelled_inside = bool(dash) and prefix == 'I'

        touched_word = False  # has this entity already overlapped an earlier word?
        for i, (begin, end) in enumerate(token_spans):
            if not (begin < entity['end'] and entity['start'] < end):
                continue
            if token_types[i] is None:
                follows_same_type = i > 0 and token_types[i - 1] == entity_type
                # "I-" only when the word really continues a run of the same
                # type; otherwise start a new entity so the tags stay valid BIO.
                is_continuation = follows_same_type and (touched_word or labelled_inside)
                token_types[i] = entity_type
                token_entities[i] = f"{'I-' if is_continuation else 'B-'}{entity_type}"
            touched_word = True

    # Plot the results
    plot_token_classification_results(
        tokens=tokens,
        true_labels=None,  # We don't have ground truth here
        pred_labels=token_entities
    )

# Plot the tags predicted for the first sample sentence
visualize_ner_prediction(text=sample_texts[0], entities=predictions[0])

In [None]:
# Visualize NER predictions for the first text
# NOTE(review): an earlier cell also defines `visualize_ner_prediction`; the
# definition in this cell is the one in effect for every later call.
def visualize_ner_prediction(text, entities):
    """Plot word-level BIO tags derived from character-offset NER predictions.

    The text is split on whitespace and every word is aligned to the
    predictions by character position: a word receives an entity's type when
    the word's ``[begin, end)`` span overlaps the entity's ``[start, end)``
    span. Aligning on offsets (rather than on substring containment) keeps
    short words such as "a" from being tagged just because the letter occurs
    inside an entity like "Satya".

    Args:
        text: The string the predictions refer to.
        entities: Iterable of prediction dicts. Each one is read through the
            keys ``'start'`` and ``'end'`` (character offsets into ``text``)
            and ``'entity'`` (a label, either prefixed like ``'B-PER'`` or
            bare like ``'PER'``). Entries whose offsets are ``None`` cannot be
            aligned and are skipped.

    Returns:
        None. The words and their tags are passed to
        ``plot_token_classification_results`` with ``true_labels=None``.
    """
    tokens = text.split()

    # Character span [begin, end) of each whitespace-separated word. Searching
    # from the end of the previous word keeps repeated words apart.
    token_spans = []
    cursor = 0
    for token in tokens:
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        token_spans.append((begin, cursor))

    token_types = [None] * len(tokens)   # bare entity type per word, None = untagged
    token_entities = ['O'] * len(tokens)

    # Process entities left to right so the first piece that touches a word
    # decides that word's tag; later pieces of the same word are ignored.
    located = [e for e in entities if e['start'] is not None and e['end'] is not None]
    for entity in sorted(located, key=lambda e: e['start']):
        label = entity['entity']
        prefix, dash, remainder = label.partition('-')
        entity_type = remainder if dash else label
        labelled_inside = bool(dash) and prefix == 'I'

        touched_word = False  # has this entity already overlapped an earlier word?
        for i, (begin, end) in enumerate(token_spans):
            if not (begin < entity['end'] and entity['start'] < end):
                continue
            if token_types[i] is None:
                follows_same_type = i > 0 and token_types[i - 1] == entity_type
                # "I-" only when the word really continues a run of the same
                # type; otherwise start a new entity so the tags stay valid BIO.
                is_continuation = follows_same_type and (touched_word or labelled_inside)
                token_types[i] = entity_type
                token_entities[i] = f"{'I-' if is_continuation else 'B-'}{entity_type}"
            touched_word = True

    # Plot the results
    plot_token_classification_results(
        tokens=tokens,
        true_labels=None,  # We don't have ground truth here
        pred_labels=token_entities
    )

# Plot the tags predicted for the first sample sentence
visualize_ner_prediction(text=sample_texts[0], entities=predictions[0])

## Custom Examples

In [None]:
# Try with your own custom text examples
custom_texts = [
    "Google and Facebook announced new AI research partnerships at Stanford University yesterday.",
    "Tesla CEO Elon Musk visited their Berlin Gigafactory in Germany last month.",
    "The World Health Organization released new guidelines for COVID-19 prevention in New York."
]

# Run predictions
custom_predictions = ner_model.predict(custom_texts)

# Display results: one numbered header per text, then one line per entity
for number, (sentence, found) in enumerate(zip(custom_texts, custom_predictions), start=1):
    print(f"\nText {number}: {sentence}")
    print("Entities:")
    for item in found:
        surface = sentence[item['start']:item['end']]
        # Dropping the "B-"/"I-" markers leaves labels without them unchanged
        bare_label = item['entity'].replace('B-', '').replace('I-', '')
        print(f"  {surface} ({bare_label}): {item['score']:.3f}")

# Visualize the first custom example
visualize_ner_prediction(custom_texts[0], custom_predictions[0])