# Named Entity Recognition (NER) Demo

This notebook demonstrates the Named Entity Recognition capabilities of the NLP toolkit, including:
- Loading and preprocessing data for NER
- Using pre-trained NER models
- Fine-tuning on custom datasets
- Evaluating NER performance
- Visualizing entity predictions
- Applying NER to real-world text

In [None]:
# Setup path to allow importing from the src directory
import sys
import os
from pathlib import Path

# Put the parent of the current working directory first on the import path
# so that the `src.*` packages can be imported
project_root = Path.cwd().resolve().parent
sys.path.insert(0, os.fspath(project_root))

# Import toolkit modules
from src.data.preprocessing import TextPreprocessor
from src.data.data_loader import get_ner_loader
from src.models.named_entity_recognition import NERModel
from src.training.metrics import token_classification_metrics
from src.utils.visualization import plot_token_classification_results

# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
import torch

## 1. Configuration and Setup

In [None]:
# Settings shared by the remaining cells
TASK = "ner"
MODEL_NAME = "dslim/bert-base-NER"  # Pre-trained NER checkpoint
DATASET_NAME = "conll2003"  # Standard NER benchmark dataset
MAX_LENGTH = 128
BATCH_SIZE = 16
NUM_EPOCHS = 1  # One epoch only, to keep the demonstration short

# Directory that receives the trained model and results (created if missing)
OUTPUT_DIR = str(Path(project_root) / "models" / "demo_ner")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

## 2. Exploring NER with Pre-trained Models

In [None]:
# Build the toolkit's NER wrapper around the pre-trained checkpoint
ner_model = NERModel(model_name=MODEL_NAME)

# Summarise the checkpoint: name, id -> label mapping, and how many distinct
# entity types remain once the B-/I- prefixes and the 'O' label are dropped
print(f"Model: {MODEL_NAME}")
print(f"Label map: {ner_model.id2label}")
entity_type_names = {
    label.split('-')[1]
    for label in ner_model.id2label.values()
    if label != 'O'
}
print(f"Number of entity types: {len(entity_type_names)}")

In [None]:
# Example sentences to run through the pre-trained model
sample_texts = [
    "Apple Inc. is planning to open a new store in New York City next month.",
    "The European Union and United States signed a new trade agreement yesterday in Brussels.",
    "Albert Einstein developed the theory of relativity while working at the Swiss Patent Office in Bern.",
    "Microsoft CEO Satya Nadella announced a partnership with OpenAI to develop new AI technologies."
]

# One list of entity predictions is returned per input text
predictions = ner_model.predict(sample_texts)

# Print every predicted span with its label and confidence score
for text_number, (text, entities) in enumerate(zip(sample_texts, predictions), start=1):
    print(f"\nText {text_number}: {text}")
    print("Entities:")
    for entity in entities:
        span_start, span_end = entity['start'], entity['end']
        entity_text = text[span_start:span_end]
        print(f"  {entity_text} ({entity['entity']}): {entity['score']:.3f}")

In [None]:
# Visualize NER predictions for the first text
def word_level_bio_tags(text, entities):
    """Project character-level entity predictions onto whitespace tokens.

    A token receives an entity's type when the token's character span
    overlaps the entity's ``[start, end)`` span. Overlap is used instead of
    substring matching so that short tokens (e.g. "a") are not tagged merely
    because their characters occur somewhere inside an entity's text.

    Args:
        text: The string the predictions refer to.
        entities: Iterable of dicts with 'start' and 'end' character offsets
            into ``text`` and an 'entity' label, with or without a
            ``B-``/``I-`` style prefix.

    Returns:
        A ``(tokens, tags)`` tuple of equal-length lists. ``tags`` holds
        'O' for untagged tokens, ``B-<type>`` for the first token of a run of
        one type and ``I-<type>`` for the following tokens of that run.
    """
    tokens = text.split()

    # Recover each token's character span by scanning forward through the text
    token_spans = []
    cursor = 0
    for token in tokens:
        token_start = text.find(token, cursor)
        cursor = token_start + len(token)
        token_spans.append((token_start, cursor))

    # First pass: record the entity type (if any) covering each token.
    # When several entities overlap one token, the last one listed wins.
    token_types = [None] * len(tokens)
    for entity in entities:
        ent_start, ent_end = entity['start'], entity['end']
        if ent_start >= ent_end:
            continue  # an empty span cannot cover any token
        label = entity['entity']
        entity_type = label.split('-')[1] if '-' in label else label
        for idx, (tok_start, tok_end) in enumerate(token_spans):
            if tok_start < ent_end and ent_start < tok_end:
                token_types[idx] = entity_type

    # Second pass: derive BIO prefixes; a change of type starts a new entity
    tags = []
    previous_type = None
    for token_type in token_types:
        if token_type is None:
            tags.append('O')
        else:
            prefix = 'I-' if token_type == previous_type else 'B-'
            tags.append(f"{prefix}{token_type}")
        previous_type = token_type

    return tokens, tags


def visualize_ner_prediction(text, entities):
    """Create word-level visualization of NER results.

    Args:
        text: The string the predictions refer to.
        entities: Predictions for ``text``; see ``word_level_bio_tags`` for
            the keys that are read from each entry.
    """
    tokens, token_entities = word_level_bio_tags(text, entities)

    # Plot the results
    plot_token_classification_results(
        tokens=tokens,
        true_labels=None,  # We don't have ground truth here
        pred_labels=token_entities
    )

# Plot the word-level tags for the first sample sentence
visualize_ner_prediction(text=sample_texts[0], entities=predictions[0])

## 3. Data Loading and Preprocessing for Fine-tuning

In [None]:
# Tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Toolkit text preprocessor with its default settings
preprocessor = TextPreprocessor()

# Hand tokenizer, preprocessor and maximum sequence length to the NER loader
loader_options = {
    "tokenizer": tokenizer,
    "preprocessor": preprocessor,
    "max_length": MAX_LENGTH,
}
dataset_loader = get_ner_loader(**loader_options)

In [None]:
# Load the CoNLL-2003 dataset
dataset = dataset_loader.load_huggingface_dataset(
    dataset_name=DATASET_NAME,
    text_column="tokens",
    label_column="ner_tags"
)

# Display dataset information
print(f"Dataset: {DATASET_NAME}")
print(f"Number of splits: {len(dataset.keys())}")
for split in dataset.keys():
    print(f"  {split}: {len(dataset[split])} examples")

# Show example data
print("\nExample data:")
# NOTE(review): assumes `dataset` is a Hugging Face DatasetDict - confirm
# against the loader. Slicing such a split (dataset["train"][:2]) returns a
# dict of columns, so looping over the slice would yield column names rather
# than rows; selecting row indices keeps the loop row-wise. The min() guard
# avoids an out-of-range index on splits with fewer than two rows.
train_split = dataset["train"]
preview_rows = train_split.select(range(min(2, len(train_split))))
for i, example in enumerate(preview_rows):
    print(f"Example {i+1}:")
    print(f"  Tokens: {example['tokens'][:10]}...")
    print(f"  NER tags: {example['ner_tags'][:10]}...")

In [None]:
# Recover the human-readable NER label names from the dataset schema
from datasets import ClassLabel

# `label_names` is only defined when the tag column is backed by a ClassLabel
ner_tag_feature = dataset['train'].features['ner_tags'].feature
if isinstance(ner_tag_feature, ClassLabel):
    label_names = ner_tag_feature.names
    print("NER Labels:")
    for label_id, label_name in enumerate(label_names):
        print(f"  {label_id}: {label_name}")

In [None]:
# Run the loader's preprocessing over every split
preprocessed_dataset = dataset_loader.preprocess_dataset(dataset)

# Wrap the preprocessed splits in PyTorch DataLoaders
dataloaders = dataset_loader.create_torch_dataloaders(preprocessed_dataset, batch_size=BATCH_SIZE)

# Keep handles to the two splits used for fine-tuning and evaluation
train_dataloader, val_dataloader = dataloaders["train"], dataloaders["validation"]

for description, loader in (("Training", train_dataloader), ("Validation", val_dataloader)):
    print(f"{description} batches: {len(loader)}")

## 4. Model Fine-tuning

In [None]:
# Create a fresh model wrapper for fine-tuning. The label count comes from the
# dataset's label names when an earlier cell defined them; otherwise None is
# passed and the choice is left to NERModel.
# NOTE(review): confirm that the dataset's label order matches the
# checkpoint's own id2label mapping before trusting decoded label names.
if 'label_names' in locals():
    num_labels = len(label_names)
else:
    num_labels = None
ner_model = NERModel(model_name=MODEL_NAME, num_labels=num_labels)

# Report which checkpoint is used and how large it is
print(f"Model: {MODEL_NAME}")
print(f"Number of parameters: {ner_model.get_model_size():,}")

In [None]:
# Fine-tune for NUM_EPOCHS epochs (a demonstration setting - real training
# typically needs more)
training_options = {
    "num_epochs": NUM_EPOCHS,
    "learning_rate": 3e-5,
    "weight_decay": 0.01,
    "output_dir": OUTPUT_DIR,
    "save_best": True,
}
training_history = ner_model.train(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    **training_options,
)

## 5. Model Evaluation

In [None]:
# Score the model on the validation dataloader
eval_results = ner_model.evaluate(val_dataloader)

# Report each metric to four decimal places
print("Model Evaluation Results:")
reported_metrics = (
    ("Loss", "loss"),
    ("F1 Score", "f1"),
    ("Precision", "precision"),
    ("Recall", "recall"),
)
for display_name, metric_key in reported_metrics:
    print(f"  {display_name}: {eval_results[metric_key]:.4f}")

In [None]:
# Compute detailed per-entity metrics
def per_entity_token_metrics(pred_labels, true_labels, entity_types):
    """Compute token-level precision, recall, F1 and support per entity type.

    A label counts towards an entity type only when it is exactly
    ``B-<type>`` or ``I-<type>``. Exact matching (rather than a suffix test)
    keeps types that end with another type's name, e.g. "XORG" and "ORG",
    from being counted together.

    Args:
        pred_labels: Predicted label names, one per token.
        true_labels: Gold label names, aligned with ``pred_labels``.
        entity_types: Entity type names (without B-/I- prefix) to report.

    Returns:
        Dict mapping each entity type, in the order given, to a dict with
        'precision', 'recall', 'f1' (floats; 0.0 when undefined) and
        'support' (number of gold tokens of that type).
    """
    tallies = {etype: {"hits": 0, "predicted": 0, "actual": 0} for etype in entity_types}

    def tagged_type(label):
        """Return the entity type of a B-/I- label, or None for other labels."""
        if label.startswith(('B-', 'I-')):
            return label[2:]
        return None

    # Single pass over the aligned labels, updating the counters of the types involved
    for pred, true in zip(pred_labels, true_labels):
        pred_type = tagged_type(pred)
        true_type = tagged_type(true)
        if pred_type in tallies:
            tallies[pred_type]["predicted"] += 1
        if true_type in tallies:
            tallies[true_type]["actual"] += 1
        if pred_type == true_type and pred_type in tallies:
            tallies[pred_type]["hits"] += 1

    metrics = {}
    for etype, tally in tallies.items():
        precision = tally["hits"] / tally["predicted"] if tally["predicted"] > 0 else 0.0
        recall = tally["hits"] / tally["actual"] if tally["actual"] > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        metrics[etype] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": tally["actual"],
        }
    return metrics


if 'label_names' in locals():
    # Collect predictions and true labels
    all_predictions = []
    all_labels = []

    device = ner_model.device
    model = ner_model.model
    model.eval()

    with torch.no_grad():
        # Loop-local names are used so that the notebook-level `batch`,
        # `predictions` and `labels` variables are not overwritten here
        for eval_batch in val_dataloader:
            eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
            logits = model(**eval_batch).logits

            batch_predictions = torch.argmax(logits, dim=2).cpu().numpy()
            batch_labels = eval_batch["labels"].cpu().numpy()

            # Drop positions labelled -100; boolean-mask indexing flattens
            # row by row, matching a per-example loop
            keep = batch_labels != -100
            all_predictions.extend(batch_predictions[keep].tolist())
            all_labels.extend(batch_labels[keep].tolist())

    # Convert numeric labels to names
    pred_labels = [label_names[p] for p in all_predictions]
    true_labels = [label_names[t] for t in all_labels]

    # Calculate per-entity metrics
    entity_types = sorted({name.split('-', 1)[1] for name in label_names if name != 'O'})

    print("\nPer-Entity Type Metrics:")
    for entity_type, scores in per_entity_token_metrics(pred_labels, true_labels, entity_types).items():
        print(f"  {entity_type}:")
        print(f"    Precision: {scores['precision']:.4f}")
        print(f"    Recall: {scores['recall']:.4f}")
        print(f"    F1: {scores['f1']:.4f}")
        print(f"    Support: {scores['support']}")

## 6. Visualizing NER Predictions

In [None]:
# Pull the first batch from the validation dataloader
batch = next(iter(val_dataloader))
input_ids, attention_mask, labels = batch["input_ids"], batch["attention_mask"], batch["labels"]

# Work with a single example from that batch
example_idx = 0
example_labels = labels[example_idx]
# unsqueeze(0) restores the batch dimension expected by the model call below
example_input_ids = input_ids[example_idx].unsqueeze(0).to(ner_model.device)
example_attention_mask = attention_mask[example_idx].unsqueeze(0).to(ner_model.device)

# Predict a label id for every position of the example
ner_model.model.eval()
with torch.no_grad():
    outputs = ner_model.model(input_ids=example_input_ids, attention_mask=example_attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()

# Map ids back to tokens and label names, keeping only positions whose gold
# label is not -100
tokens = tokenizer.convert_ids_to_tokens(input_ids[example_idx])
valid_indices = example_labels != -100

valid_tokens = [token for token, keep in zip(tokens, valid_indices) if keep]
valid_true_labels = [
    label_names[label_id]
    for label_id, keep in zip(example_labels, valid_indices)
    if keep
]
valid_pred_labels = [
    label_names[pred_id]
    for pred_id, keep in zip(predictions, valid_indices)
    if keep
]

In [None]:
# Tab-separated table: one row per kept token with its gold and predicted label
print("Token\tTrue Label\tPredicted Label")
print("-" * 50)
for row in zip(valid_tokens, valid_true_labels, valid_pred_labels):
    print(*row, sep="\t")

In [None]:
# Plot gold versus predicted labels for the selected example with the toolkit helper
plot_arguments = {
    "tokens": valid_tokens,
    "true_labels": valid_true_labels,
    "pred_labels": valid_pred_labels,
}
plot_token_classification_results(**plot_arguments)

## 7. Applying NER to Real-world Text

In [None]:
# Real-world example text
news_article = """
Apple Inc. announced today that CEO Tim Cook will present the latest iPhone model at their headquarters in Cupertino, California next month. 
The event, scheduled for September 12th, is expected to draw technology journalists from around the world including representatives from the New York Times and CNN.
Financial analysts from Goldman Sachs and JP Morgan Chase predict the new device will boost Apple's stock price on NASDAQ.
Meanwhile, competitors Samsung and Google are reportedly preparing their own product announcements in response.
"""

# Apply NER to the text
entities = ner_model.predict([news_article])[0]

# Group the predicted span texts by entity type (B-/I- prefix stripped)
entity_by_type = {}
for entity in entities:
    label = entity['entity']
    entity_type = label.split('-')[1] if '-' in label else label
    entity_text = news_article[entity['start']:entity['end']]
    entity_by_type.setdefault(entity_type, []).append(entity_text)

print("Entities by Type:")
# The loop variables deliberately avoid the names `entities` and `entity`:
# rebinding `entities` here would replace the prediction list with a list of
# strings, and the highlighting cell that follows still needs the predictions.
for entity_type, mentions in entity_by_type.items():
    print(f"\n{entity_type}:")
    for mention in sorted(set(mentions)):
        print(f"  {mention}")

In [None]:
# Visualize entities in the text
from IPython.display import HTML, display
import re

def highlight_entities(text, entities):
    """Return ``text`` as an HTML fragment with entities wrapped in coloured spans.

    The fragment is assembled left to right from the original text, so every
    offset refers to the unmodified string. Plain text and entity text are
    HTML-escaped, which keeps characters such as ``<`` or ``&`` in the input
    from being interpreted as markup. An entity that starts inside an already
    highlighted span (or whose end precedes its start) is skipped instead of
    corrupting the markup.

    Args:
        text: The string the predictions refer to.
        entities: Iterable of dicts with 'start' and 'end' character offsets
            into ``text`` and an 'entity' label, with or without a
            ``B-``/``I-`` style prefix. Input order does not matter.

    Returns:
        The HTML string, with newlines replaced by ``<br>``.
    """
    # Imported locally so the function does not rely on a file-level import
    import html

    # Define colors for different entity types
    colors = {
        'PER': '#FFADAD',  # Light red
        'ORG': '#FFD6A5',  # Light orange
        'LOC': '#CAFFBF',  # Light green
        'MISC': '#9BF6FF',  # Light blue
    }

    pieces = []
    cursor = 0  # index in `text` up to which output has been produced
    for entity in sorted(entities, key=lambda ent: ent['start']):
        start, end = entity['start'], entity['end']
        if start < cursor or end < start:
            continue

        label = entity['entity']
        entity_type = label.split('-')[1] if '-' in label else label
        color = colors.get(entity_type, '#E2E2E2')  # Default gray for unknown types

        # Quotes only need escaping inside the attribute value, not in body text
        pieces.append(html.escape(text[cursor:start], quote=False))
        pieces.append(
            f'<span style="background-color: {color}; padding: 2px; border-radius: 3px;" '
            f'title="{html.escape(entity_type)}">'
            f'{html.escape(text[start:end], quote=False)}</span>'
        )
        cursor = end

    pieces.append(html.escape(text[cursor:], quote=False))

    # Replace newlines with HTML breaks
    return ''.join(pieces).replace('\n', '<br>')

# Render the highlighted article inline in the notebook
html_result = highlight_entities(news_article, entities)
wrapper_style = "font-size: 14px; line-height: 1.5;"
display(HTML(f'<div style="{wrapper_style}">{html_result}</div>'))

## 8. Save and Load Model

In [None]:
# Persist the fine-tuned model beneath the output directory
save_path = os.path.join(OUTPUT_DIR, "final_model")
ner_model.save(save_path)
print("Model saved to", save_path)

In [None]:
# Restore the model from the path written by the save cell
loaded_model = NERModel.load(save_path)
print("Model loaded successfully")

# Sanity check: run the restored model on one short sentence
test_text = "Bill Gates founded Microsoft in 1975."
batch_predictions = loaded_model.predict([test_text])
entities = batch_predictions[0]

print("Detected entities:")
for entity in entities:
    begin, stop = entity['start'], entity['end']
    entity_text = test_text[begin:stop]
    print(f"  {entity_text} ({entity['entity']}): {entity['score']:.3f}")

## 9. Conclusion

In this notebook, we demonstrated the Named Entity Recognition capabilities of the NLP toolkit:

1. We explored pre-trained NER models and their predictions
2. We loaded and preprocessed the CoNLL-2003 dataset for NER fine-tuning
3. We fine-tuned a BERT-based NER model on the dataset
4. We evaluated model performance with overall metrics and token-level precision, recall, and F1 broken down by entity type
5. We visualized NER predictions on real-world text
6. We saved and loaded the model for future use

NER is useful for many downstream applications including information extraction, knowledge graph construction, and content recommendation. The model provides a solid foundation for these applications, though for production use you would typically train for more epochs and potentially on domain-specific data.