# Model 5: Fine-tuned BERT for Named Entity Recognition

This notebook implements a fine-tuned BERT model for NER using the HuggingFace Transformers library.

**Approach:**
- Use `bert-base-cased` for NER (preserves capitalization)
- Fine-tune on our training data using token classification head
- Handle subword tokenization alignment
- Use entity-span level F1 for evaluation

**Expected Performance:** 90%+ F1 score

In [3]:
# Import libraries
import json
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')

# PyTorch and Transformers
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset as HFDataset

# Import utils for evaluation
from utils import extract_entities, evaluate_entity_spans, print_evaluation_report

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA version: {torch.version.cuda}")

print("Libraries imported successfully!")

Using device: cuda
GPU: NVIDIA L4
CUDA version: 12.8
Libraries imported successfully!


## 1. Load and Prepare Data

In [4]:
def load_jsonl(file_path):
    """Load JSONL file into a list of dictionaries"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

def clean_data(data):
    """Remove samples with invalid BIO sequences"""
    cleaned_data = []
    invalid_count = 0

    for sample in data:
        prev_tag = 'O'
        has_issue = False

        for tag in sample['ner_tags']:
            if tag.startswith('I-'):
                entity_type = tag[2:]
                # Check if I- follows B- or I- of same type
                if not (prev_tag == f'B-{entity_type}' or prev_tag == f'I-{entity_type}'):
                    has_issue = True
                    break
            prev_tag = tag

        if not has_issue:
            cleaned_data.append(sample)
        else:
            invalid_count += 1

    print(f"Removed {invalid_count} samples with invalid BIO sequences")
    return cleaned_data

# Load all training data and test data
train_data_all = load_jsonl('train_data.jsonl')
test_data = load_jsonl('test_data.jsonl')

print(f"Total training samples: {len(train_data_all):,}")
print(f"Test samples: {len(test_data):,}")

# Clean training data
train_data_cleaned = clean_data(train_data_all)
print(f"Training samples after cleaning: {len(train_data_cleaned):,}")

# Create stratification labels based on presence of entities
from sklearn.model_selection import train_test_split

stratify_labels = []
for sample in train_data_cleaned:
    has_entities = any(tag != 'O' for tag in sample['ner_tags'])
    stratify_labels.append(int(has_entities))

# Split into train and validation (90/10 split, same as other models)
train_data, val_data = train_test_split(
    train_data_cleaned,
    test_size=0.1,
    random_state=42,
    stratify=stratify_labels
)

print(f"\nTraining samples: {len(train_data):,}")
print(f"Validation samples: {len(val_data):,}")
print(f"Test samples: {len(test_data):,}")

# Show example
print("\nExample training sample:")
print(json.dumps(train_data[0], indent=2))

Total training samples: 100,541
Test samples: 5,000
Removed 185 samples with invalid BIO sequences
Training samples after cleaning: 100,356

Training samples: 90,320
Validation samples: 10,036
Test samples: 5,000

Example training sample:
{
  "id": 76287,
  "tokens": [
    "she",
    "then",
    "joined",
    "the",
    "goa",
    "football",
    "association",
    "and",
    "refereed",
    "matches",
    "for",
    "men",
    "in",
    "the",
    "local",
    "leagues",
    "."
  ],
  "ner_tags": [
    "O",
    "O",
    "O",
    "O",
    "B-ORG",
    "I-ORG",
    "I-ORG",
    "O",
    "O",
    "O",
    "O",
    "O",
    "O",
    "O",
    "O",
    "O",
    "O"
  ]
}


## 2. Analyze Tag Set and Initialize BERT

In [5]:
# Get all unique tags
all_tags = set()
tag_counts = Counter()

for sample in train_data:
    all_tags.update(sample['ner_tags'])
    tag_counts.update(sample['ner_tags'])

print(f"Unique tags: {len(all_tags)}")
print("\nTag distribution:")
for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = count / sum(tag_counts.values()) * 100
    print(f"  {tag:20s}: {count:8,} ({percentage:5.2f}%)")

# Create tag lists
label_list = sorted(list(all_tags))
num_labels = len(label_list)
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(f"\nNumber of labels: {num_labels}")
print(f"Labels: {label_list}")

Unique tags: 15

Tag distribution:
  O                   : 1,029,204 (80.25%)
  B-HumanSettlement   :   32,261 ( 2.52%)
  I-ORG               :   28,182 ( 2.20%)
  I-Artist            :   27,158 ( 2.12%)
  B-Artist            :   25,817 ( 2.01%)
  I-OtherPER          :   20,707 ( 1.61%)
  I-Facility          :   19,725 ( 1.54%)
  B-ORG               :   17,235 ( 1.34%)
  I-Politician        :   16,975 ( 1.32%)
  B-OtherPER          :   15,897 ( 1.24%)
  I-HumanSettlement   :   15,303 ( 1.19%)
  B-Facility          :   12,827 ( 1.00%)
  B-Politician        :   12,711 ( 0.99%)
  B-PublicCorp        :    4,854 ( 0.38%)
  I-PublicCorp        :    3,662 ( 0.29%)

Number of labels: 15
Labels: ['B-Artist', 'B-Facility', 'B-HumanSettlement', 'B-ORG', 'B-OtherPER', 'B-Politician', 'B-PublicCorp', 'I-Artist', 'I-Facility', 'I-HumanSettlement', 'I-ORG', 'I-OtherPER', 'I-Politician', 'I-PublicCorp', 'O']


In [6]:
# Initialize BERT tokenizer and model
model_name = "bert-base-cased"  # Use cased for NER (preserves capitalization)

print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with token classification head
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

print(f"Model loaded successfully!")
print(f"Model parameters: {model.num_parameters():,}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size:,}")

Loading model: bert-base-cased


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Model parameters: 107,731,215
Tokenizer vocab size: 28,996


## 3. Tokenize and Align Labels

In [7]:
def tokenize_and_align_labels(examples):
    """Tokenize and align labels with BERT's subword tokenization"""
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=128
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens (CLS, SEP, PAD)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of word
                label_ids.append(label2id[label[word_idx]])
            else:
                # Subsequent tokens of same word (subword tokens)
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Demonstrate tokenization and alignment
print("Demonstrating BERT tokenization and label alignment:")
print("=" * 60)

example_sample = train_data[0]
tokens = example_sample['tokens'][:10]  # Take first 10 tokens
labels = example_sample['ner_tags'][:10]

print(f"Original tokens: {tokens}")
print(f"Original labels: {labels}")
print()

# Tokenize
tokenized = tokenizer(
    tokens,
    is_split_into_words=True,
    truncation=True,
    padding=False,
    max_length=128
)

bert_tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])
word_ids = tokenized.word_ids()

print(f"BERT tokens: {bert_tokens}")
print(f"Word IDs:    {word_ids}")
print()

# Show alignment
print("Alignment:")
for i, (token, word_id) in enumerate(zip(bert_tokens, word_ids)):
    if word_id is not None:
        if i < len(labels) and word_id < len(labels):
            original_label = labels[word_id]
        else:
            original_label = "N/A"
        print(f"  {token:15s} -> word_id {word_id:2d} -> {original_label}")
    else:
        print(f"  {token:15s} -> special token -> -100")

Demonstrating BERT tokenization and label alignment:
Original tokens: ['she', 'then', 'joined', 'the', 'goa', 'football', 'association', 'and', 'refereed', 'matches']
Original labels: ['O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O']

BERT tokens: ['[CLS]', 'she', 'then', 'joined', 'the', 'go', '##a', 'football', 'association', 'and', 'referee', '##d', 'matches', '[SEP]']
Word IDs:    [None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, None]

Alignment:
  [CLS]           -> special token -> -100
  she             -> word_id  0 -> O
  then            -> word_id  1 -> O
  joined          -> word_id  2 -> O
  the             -> word_id  3 -> O
  go              -> word_id  4 -> B-ORG
  ##a             -> word_id  4 -> B-ORG
  football        -> word_id  5 -> I-ORG
  association     -> word_id  6 -> I-ORG
  and             -> word_id  7 -> O
  referee         -> word_id  8 -> N/A
  ##d             -> word_id  8 -> N/A
  matches         -> word_id  9 -> N/A
  [SEP]           -> special 

In [8]:
# Convert to HuggingFace Dataset format
train_dataset = HFDataset.from_dict({
    'tokens': [sample['tokens'] for sample in train_data],
    'ner_tags': [sample['ner_tags'] for sample in train_data],
    'id': [sample['id'] for sample in train_data]
})

val_dataset = HFDataset.from_dict({
    'tokens': [sample['tokens'] for sample in val_data],
    'ner_tags': [sample['ner_tags'] for sample in val_data],
    'id': [sample['id'] for sample in val_data]
})

# Apply tokenization
print("Tokenizing training data...")
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
print("Tokenizing validation data...")
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)

print(f"\nTraining dataset size: {len(tokenized_train):,}")
print(f"Validation dataset size: {len(tokenized_val):,}")

# Check tokenized example
print("\nTokenized example:")
example = tokenized_train[0]
print(f"Input IDs length: {len(example['input_ids'])}")
print(f"Attention mask length: {len(example['attention_mask'])}")
print(f"Labels length: {len(example['labels'])}")
print(f"Number of -100 labels: {example['labels'].count(-100)}")

Tokenizing training data...


Map:   0%|          | 0/90320 [00:00<?, ? examples/s]

Tokenizing validation data...


Map:   0%|          | 0/10036 [00:00<?, ? examples/s]


Training dataset size: 90,320
Validation dataset size: 10,036

Tokenized example:
Input IDs length: 54
Attention mask length: 54
Labels length: 54
Number of -100 labels: 37


## 4. Set Up Training

In [9]:
def compute_metrics(p):
    """Compute entity-span level F1 score for evaluation"""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) and convert to tags
    true_tags = []
    pred_tags = []

    for pred_seq, label_seq in zip(predictions, labels):
        true_seq = []
        pred_seq_clean = []

        for pred, label in zip(pred_seq, label_seq):
            if label != -100:  # Only evaluate non-ignored tokens
                true_seq.append(id2label[label])
                pred_seq_clean.append(id2label[pred])

        if len(true_seq) > 0:  # Only add non-empty sequences
            true_tags.append(true_seq)
            pred_tags.append(pred_seq_clean)

    # Use our entity-span evaluation
    results = evaluate_entity_spans(true_tags, pred_tags)
    return {
        "precision": results["precision"],
        "recall": results["recall"],
        "f1": results["f1"]
    }

# Data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_ner",
    learning_rate=5e-5,  # BERT recommendation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # BERT overfits quickly
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_steps=500,
    logging_steps=100,
    save_total_limit=2,
    dataloader_num_workers=0,  # Windows compatibility
    report_to=[],  # Disable wandb/mlflow
)

print("Training configuration:")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Warmup steps: {training_args.warmup_steps}")
print(f"  Mixed precision: {training_args.fp16}")
print(f"  Max sequence length: 128")

Training configuration:
  Learning rate: 5e-05
  Batch size: 16
  Epochs: 3
  Warmup steps: 500
  Mixed precision: False
  Max sequence length: 128


## 5. Train BERT Model

In [11]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Starting BERT fine-tuning...")
print(f"Training samples: {len(tokenized_train):,}")
print(f"Validation samples: {len(tokenized_val):,}")
print(f"Device: {device}")
print("\nTraining progress:")

Starting BERT fine-tuning...
Training samples: 90,320
Validation samples: 10,036
Device: cuda

Training progress:


In [12]:
# Train the model
import time
start_time = time.time()

trainer.train()

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time/60:.2f} minutes")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1385,0.142202,0.761526,0.761187,0.761357
2,0.0948,0.13394,0.778071,0.78731,0.782663
3,0.054,0.147969,0.791234,0.797106,0.794159



Training completed in 27.32 minutes


In [13]:
# Evaluate on validation set
print("\nEvaluating BERT on validation set...")
print("=" * 60)

eval_results = trainer.evaluate()

print("\nBERT Fine-tuning Results:")
print(f"  Precision: {eval_results['eval_precision']:.4f}")
print(f"  Recall:    {eval_results['eval_recall']:.4f}")
print(f"  F1 Score:  {eval_results['eval_f1']:.4f}")
print(f"  Validation loss: {eval_results['eval_loss']:.4f}")

# Compare with expected from implementation plan
expected_f1_min = 0.90
actual_f1 = eval_results['eval_f1']

print(f"\nPerformance Analysis:")
print(f"  Expected F1 minimum: {expected_f1_min:.2f}")
print(f"  Actual F1 score:     {actual_f1:.4f}")

if actual_f1 >= expected_f1_min:
    print("  ‚úÖ F1 score meets expectations!")
else:
    print("  ‚ö†Ô∏è  F1 score below expected minimum")

# Store results for later comparison
bert_results = {
    'model': 'BERT Fine-tuned',
    'precision': eval_results['eval_precision'],
    'recall': eval_results['eval_recall'],
    'f1': eval_results['eval_f1'],
    'training_time': training_time,
    'parameters': model.num_parameters()
}


Evaluating BERT on validation set...



BERT Fine-tuning Results:
  Precision: 0.7912
  Recall:    0.7971
  F1 Score:  0.7942
  Validation loss: 0.1480

Performance Analysis:
  Expected F1 minimum: 0.90
  Actual F1 score:     0.7942
  ‚ö†Ô∏è  F1 score below expected minimum


## 6. Analyze Model Performance

In [14]:
# Get detailed predictions on validation set for analysis
print("Generating detailed predictions for analysis...")

# Get predictions
predictions = trainer.predict(tokenized_val)
preds = np.argmax(predictions.predictions, axis=2)
labels = predictions.label_ids

# Convert to tag sequences
val_sentences = [sample['tokens'] for sample in val_data]
val_true_tags = [sample['ner_tags'] for sample in val_data]
val_pred_tags = []

for i, (pred_seq, label_seq) in enumerate(zip(preds, labels)):
    pred_tags = []
    for pred, label in zip(pred_seq, label_seq):
        if label != -100:
            pred_tags.append(id2label[pred])
    val_pred_tags.append(pred_tags)

# Show examples
print("\nPrediction examples:")
for i in range(5):
    if i < len(val_sentences):
        print(f"\nExample {i+1}:")
        tokens = val_sentences[i][:15]
        true_tags = val_true_tags[i][:15]
        pred_tags = val_pred_tags[i][:15]
        
        print(f"  Tokens:    {tokens}")
        print(f"  True:      {true_tags}")
        print(f"  Predicted: {pred_tags}")
        
        # Count correct predictions
        correct = sum(1 for t, p in zip(true_tags, pred_tags) if t == p)
        total = len(true_tags)
        print(f"  Accuracy:  {correct}/{total} ({correct/total:.2%})")

Generating detailed predictions for analysis...

Prediction examples:

Example 1:
  Tokens:    ['in', '1933', 'phil', 'spitalny', 'directed', 'the', 'orchestra', 'for', 'the']
  True:      ['O', 'O', 'B-Artist', 'I-Artist', 'O', 'O', 'O', 'O', 'O']
  Predicted: ['O', 'O', 'B-Artist', 'I-Artist', 'O', 'O', 'O', 'O', 'O']
  Accuracy:  9/9 (100.00%)

Example 2:
  Tokens:    ['inside', 'the', 'vatican', 'museums', '(', 'rome', 'italy', ')']
  True:      ['O', 'O', 'B-Facility', 'I-Facility', 'O', 'O', 'O', 'O']
  Predicted: ['O', 'O', 'B-Facility', 'I-Facility', 'O', 'O', 'B-HumanSettlement', 'O']
  Accuracy:  7/8 (87.50%)

Example 3:
  Tokens:    ['alden', 'thnodup', 'namgyal', 'was', 'subsequently', 'recognised', 'as', 'the', 'reincarnate', 'leader', 'of', 'phodong', '.']
  True:      ['B-OtherPER', 'I-OtherPER', 'I-OtherPER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Facility', 'O']
  Predicted: ['B-Politician', 'I-Politician', 'I-Politician', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B

In [15]:
# Analyze entity-level performance
from collections import defaultdict

def analyze_entity_performance(true_tags_list, pred_tags_list, sentences_list):
    """Analyze performance by entity type"""
    entity_stats = defaultdict(lambda: {'tp': 0, 'fp': 0, 'fn': 0})
    
    for true_tags, pred_tags, tokens in zip(true_tags_list, pred_tags_list, sentences_list):
        true_entities = extract_entities(tokens, true_tags)
        pred_entities = extract_entities(tokens, pred_tags)
        
        true_spans = {(start, end, entity_type) for _, entity_type, start, end in true_entities}
        pred_spans = {(start, end, entity_type) for _, entity_type, start, end in pred_entities}
        
        # True positives
        for span in true_spans & pred_spans:
            entity_stats[span[2]]['tp'] += 1
        
        # False positives
        for span in pred_spans - true_spans:
            entity_stats[span[2]]['fp'] += 1
        
        # False negatives
        for span in true_spans - pred_spans:
            entity_stats[span[2]]['fn'] += 1
    
    # Calculate metrics per entity type
    results = {}
    for entity_type, stats in entity_stats.items():
        tp, fp, fn = stats['tp'], stats['fp'], stats['fn']
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        results[entity_type] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'count': tp + fn
        }
    
    return results

# Analyze by entity type
entity_results = analyze_entity_performance(val_true_tags, val_pred_tags, val_sentences)

print("\nEntity-type Performance:")
print("=" * 80)
print(f"{'Entity Type':<20} {'Count':<8} {'Prec':<8} {'Rec':<8} {'F1':<8}")
print("-" * 80)

for entity_type in sorted(entity_results.keys()):
    metrics = entity_results[entity_type]
    print(f"{entity_type:<20} {metrics['count']:<8} {metrics['precision']:<8.3f} {metrics['recall']:<8.3f} {metrics['f1']:<8.3f}")

# Show overall stats
total_entities = sum(m['count'] for m in entity_results.values())
print(f"\nTotal entities: {total_entities:,}")


Entity-type Performance:
Entity Type          Count    Prec     Rec      F1      
--------------------------------------------------------------------------------
Artist               2849     0.795    0.830    0.812   
Facility             1487     0.784    0.769    0.776   
HumanSettlement      3476     0.923    0.940    0.932   
ORG                  1893     0.748    0.763    0.756   
OtherPER             1779     0.639    0.657    0.648   
Politician           1402     0.736    0.651    0.691   
PublicCorp           589      0.724    0.742    0.733   

Total entities: 13,475


## 7. Error Analysis

In [16]:
def analyze_errors(true_tags_list, pred_tags_list, sentences_list, num_examples=5):
    """Analyze common error patterns"""
    
    false_positives = []  # Predicted entity that's not actually an entity
    false_negatives = []  # Missed entity
    wrong_type = []       # Correct span but wrong entity type
    
    for true_tags, pred_tags, tokens in zip(true_tags_list, pred_tags_list, sentences_list):
        true_entities = extract_entities(tokens, true_tags)
        pred_entities = extract_entities(tokens, pred_tags)
        
        true_spans = {(start, end, entity_type) for _, entity_type, start, end in true_entities}
        pred_spans = {(start, end, entity_type) for _, entity_type, start, end in pred_entities}
        
        # False positives: predicted but not true
        fp_spans = pred_spans - true_spans
        for start, end, entity_type in fp_spans:
            entity_text = ' '.join(tokens[start:end+1])
            false_positives.append((entity_text, entity_type, tokens))
        
        # False negatives: true but not predicted
        fn_spans = true_spans - pred_spans
        for start, end, entity_type in fn_spans:
            entity_text = ' '.join(tokens[start:end+1])
            false_negatives.append((entity_text, entity_type, tokens))
        
        # Wrong type: same span but different type
        true_span_dict = {(start, end): entity_type for start, end, entity_type in true_spans}
        pred_span_dict = {(start, end): entity_type for start, end, entity_type in pred_spans}
        
        common_spans = set(true_span_dict.keys()) & set(pred_span_dict.keys())
        for start, end in common_spans:
            true_type = true_span_dict[(start, end)]
            pred_type = pred_span_dict[(start, end)]
            if true_type != pred_type:
                entity_text = ' '.join(tokens[start:end+1])
                wrong_type.append((entity_text, true_type, pred_type, tokens))
    
    return false_positives, false_negatives, wrong_type

# Analyze errors
fp, fn, wt = analyze_errors(val_true_tags, val_pred_tags, val_sentences)

print("\nError Analysis:")
print(f"False Positives: {len(fp):,} (predicted entities that shouldn't exist)")
print(f"False Negatives: {len(fn):,} (missed entities)")
print(f"Wrong Type:      {len(wt):,} (correct span, wrong entity type)")

# Show examples of each error type
print("\nExample False Positives:")
for i, (text, pred_type, tokens) in enumerate(fp[:3]):
    print(f"  {i+1}. '{text}' ‚Üí predicted as {pred_type}")

print("\nExample False Negatives:")
for i, (text, true_type, tokens) in enumerate(fn[:3]):
    print(f"  {i+1}. '{text}' ‚Üí missed {true_type}")

print("\nExample Wrong Types:")
for i, (text, true_type, pred_type, tokens) in enumerate(wt[:3]):
    print(f"  {i+1}. '{text}' ‚Üí true: {true_type}, predicted: {pred_type}")


Error Analysis:
False Positives: 2,834 (predicted entities that shouldn't exist)
False Negatives: 2,734 (missed entities)
Wrong Type:      1,749 (correct span, wrong entity type)

Example False Positives:
  1. 'italy' ‚Üí predicted as HumanSettlement
  2. 'alden thnodup namgyal' ‚Üí predicted as Politician
  3. 'phodong' ‚Üí predicted as HumanSettlement

Example False Negatives:
  1. 'phodong' ‚Üí missed Facility
  2. 'alden thnodup namgyal' ‚Üí missed OtherPER
  3. 'ski resort' ‚Üí missed Facility

Example Wrong Types:
  1. 'alden thnodup namgyal' ‚Üí true: OtherPER, predicted: Politician
  2. 'phodong' ‚Üí true: Facility, predicted: HumanSettlement
  3. 'wilhelm ritter von leeb' ‚Üí true: OtherPER, predicted: Politician


## 8. Save Model

In [17]:
# Save the fine-tuned model
model_save_path = "./bert_ner_finetuned"

print(f"Saving model to: {model_save_path}")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Model saved successfully!")

# Save model info
model_info = {
    'model_name': model_name,
    'num_labels': num_labels,
    'label_list': label_list,
    'label2id': label2id,
    'id2label': id2label,
    'results': bert_results,
    'training_args': {
        'learning_rate': training_args.learning_rate,
        'batch_size': training_args.per_device_train_batch_size,
        'epochs': training_args.num_train_epochs,
        'warmup_steps': training_args.warmup_steps
    }
}

with open(f"{model_save_path}/model_info.json", 'w') as f:
    json.dump(model_info, f, indent=2)

print("Model info saved!")

Saving model to: ./bert_ner_finetuned
Model saved successfully!
Model info saved!


## 9. Generate Test Predictions

In [11]:
model_save_path = "./bert_ner_finetuned"

In [17]:
# Load the fine-tuned model for inference
from transformers import pipeline

print("Loading fine-tuned model for inference...")

# Create a pipeline for token classification
ner_pipeline = pipeline(
    "token-classification",
    model=model_save_path,
    tokenizer=model_save_path,
    device=0 if device.type == 'cuda' else -1,
    aggregation_strategy=None  # We'll handle subword alignment ourselves
)

print("Pipeline loaded successfully!")

Device set to use cuda:0


Loading fine-tuned model for inference...
Pipeline loaded successfully!


In [21]:
print("Loading fine-tuned model for inference...")

# Load model + tokenizer manually
model = AutoModelForTokenClassification.from_pretrained(model_save_path)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# üî• REQUIRED FIX
model.to(device)

print(f"Model loaded on: {device}")

def predict_bert_ner(tokens_list, model, tokenizer, batch_size=32):
    """Predict NER tags using fine-tuned BERT model"""
    predictions = []
    
    # Process in batches
    for i in range(0, len(tokens_list), batch_size):
        if i % 500 == 0:
            print(f"  Processing {i:,}/{len(tokens_list):,}")
        
        batch_tokens = tokens_list[i:i+batch_size]
        
        # Tokenize batch
        tokenized = tokenizer(
            batch_tokens,
            is_split_into_words=True,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        for k, v in tokenized.items():
            if isinstance(v, torch.Tensor):
                tokenized[k] = v.to(device)

        
        # Get predictions
        with torch.no_grad():
            outputs = model(**tokenized)
            logits = outputs.logits
            predictions_batch = torch.argmax(logits, dim=-1).cpu().numpy()
        
        # Convert predictions back to tag sequences
        for j, (tokens, pred_ids) in enumerate(zip(batch_tokens, predictions_batch)):
            word_ids = tokenized.word_ids(batch_index=j)
            pred_tags = []
            
            for k, word_id in enumerate(word_ids):
                if word_id is not None and word_id < len(tokens):
                    # Only keep prediction for first subword token
                    if k == 0 or word_ids[k-1] != word_id:
                        pred_tags.append(id2label[pred_ids[k]])
            
            predictions.append(pred_tags)
    
    return predictions

# Prepare test data
test_sentences = [sample['tokens'] for sample in test_data]

print(f"\nGenerating predictions for {len(test_sentences):,} test sentences...")
test_pred_tags = predict_bert_ner(test_sentences, model, tokenizer)

print(f"\nTest predictions complete!")
print(f"Generated predictions for {len(test_pred_tags):,} sentences")

Loading fine-tuned model for inference...
Model loaded on: cuda

Generating predictions for 5,000 test sentences...
  Processing 0/5,000


  Processing 4,000/5,000

Test predictions complete!
Generated predictions for 5,000 sentences


In [22]:
# Verify format
print("\nTest prediction examples:")
for i in range(3):
    sample = test_data[i]
    tokens = sample['tokens']
    pred_tags = test_pred_tags[i]
    print(f"\nExample {i+1} (ID: {sample['id']}):")
    print(f"  Tokens:    {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
    print(f"  Predicted: {pred_tags[:10]}{'...' if len(pred_tags) > 10 else ''}")
    print(f"  Length match: {len(tokens) == len(pred_tags)}")


Test prediction examples:

Example 1 (ID: 1):
  Tokens:    ['he', 'is', 'played', 'by', 'jared', 'harris', 'in', 'the', 'movie', 'and']...
  Predicted: ['O', 'O', 'O', 'O', 'B-Artist', 'I-Artist', 'O', 'O', 'O', 'O']...
  Length match: True

Example 2 (ID: 2):
  Tokens:    ['he', 'was', 'buried', 'in', 'the', 'bayeux', 'cathedral', '.']
  Predicted: ['O', 'O', 'O', 'O', 'O', 'B-Facility', 'I-Facility', 'O']
  Length match: True

Example 3 (ID: 3):
  Tokens:    ['tom', 'wright', 'architect', 'designed', 'the', 'burj', 'al', 'arab', 'in', 'dubai']...
  Predicted: ['B-OtherPER', 'I-OtherPER', 'O', 'O', 'O', 'B-Facility', 'I-Facility', 'I-Facility', 'O', 'B-HumanSettlement']...
  Length match: True


In [23]:
# Add predictions to test data
test_data_with_predictions = []
for sample, pred_tags in zip(test_data, test_pred_tags):
    sample_copy = sample.copy()
    sample_copy['ner_tags'] = pred_tags
    test_data_with_predictions.append(sample_copy)

# Validate predictions
print("Validating test predictions...")

validation_errors = []
for i, sample in enumerate(test_data_with_predictions):
    # Check length match
    if len(sample['tokens']) != len(sample['ner_tags']):
        validation_errors.append(f"Sample {i}: Length mismatch")
    
    # Check for valid tags
    valid_tags = set(label_list)
    for tag in sample['ner_tags']:
        if tag not in valid_tags:
            validation_errors.append(f"Sample {i}: Invalid tag '{tag}'")
            break

if validation_errors:
    print(f"Found {len(validation_errors)} validation errors:")
    for error in validation_errors[:5]:
        print(f"  - {error}")
else:
    print("‚úì All validations passed!")

# Save predictions
output_file = 'test_data_bert_predictions.jsonl'
with open(output_file, 'w', encoding='utf-8') as f:
    for sample in test_data_with_predictions:
        f.write(json.dumps(sample) + '\n')

print(f"\nSaved predictions to: {output_file}")

# Generate statistics
all_test_tags = []
for sample in test_data_with_predictions:
    all_test_tags.extend(sample['ner_tags'])

tag_counts = Counter(all_test_tags)
print(f"\nTest prediction statistics:")
print(f"  Total tokens: {len(all_test_tags):,}")
print(f"  Tag distribution:")
for tag, count in sorted(tag_counts.items(), key=lambda x: x[1], reverse=True):
    percentage = count / len(all_test_tags) * 100
    print(f"    {tag:20s}: {count:8,} ({percentage:5.2f}%)")

# Count predicted entities
test_entities = []
for sample in test_data_with_predictions:
    entities = extract_entities(sample['tokens'], sample['ner_tags'])
    test_entities.extend(entities)

entity_type_counts = Counter(entity_type for _, entity_type, _, _ in test_entities)
print(f"\nPredicted entities: {len(test_entities):,}")
print(f"  Entity type distribution:")
for entity_type, count in sorted(entity_type_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"    {entity_type:20s}: {count:6,}")

Validating test predictions...
‚úì All validations passed!

Saved predictions to: test_data_bert_predictions.jsonl

Test prediction statistics:
  Total tokens: 70,746
  Tag distribution:
    O                   :   56,813 (80.31%)
    B-HumanSettlement   :    1,771 ( 2.50%)
    I-Artist            :    1,571 ( 2.22%)
    B-Artist            :    1,523 ( 2.15%)
    I-ORG               :    1,510 ( 2.13%)
    I-OtherPER          :    1,242 ( 1.76%)
    I-Facility          :    1,049 ( 1.48%)
    B-OtherPER          :      939 ( 1.33%)
    B-ORG               :      912 ( 1.29%)
    I-HumanSettlement   :      864 ( 1.22%)
    I-Politician        :      823 ( 1.16%)
    B-Facility          :      697 ( 0.99%)
    B-Politician        :      607 ( 0.86%)
    B-PublicCorp        :      244 ( 0.34%)
    I-PublicCorp        :      181 ( 0.26%)

Predicted entities: 6,751
  Entity type distribution:
    HumanSettlement     :  1,776
    Artist              :  1,527
    OtherPER            :    945

## 10. Summary and Results

In [24]:
print("=" * 80)
print("BERT FINE-TUNED MODEL - SUMMARY")
print("=" * 80)

print(f"\nüìä Model Performance:")
print(f"   Precision: {bert_results['precision']:.4f}")
print(f"   Recall:    {bert_results['recall']:.4f}")
print(f"   F1 Score:  {bert_results['f1']:.4f}")

print(f"\nüîß Model Details:")
print(f"   Base model:           {model_name}")
print(f"   Parameters:           {bert_results['parameters']:,}")
print(f"   Number of labels:     {num_labels}")
print(f"   Training time:        {bert_results['training_time']/60:.1f} minutes")
print(f"   Training samples:     {len(tokenized_train):,}")
print(f"   Validation samples:   {len(tokenized_val):,}")

print(f"\n‚öôÔ∏è Training Configuration:")
print(f"   Learning rate:        {training_args.learning_rate}")
print(f"   Batch size:           {training_args.per_device_train_batch_size}")
print(f"   Epochs:               {training_args.num_train_epochs}")
print(f"   Warmup steps:         {training_args.warmup_steps}")
print(f"   Max sequence length:  128")
print(f"   Mixed precision:      {training_args.fp16}")

print(f"\nüìã Test Predictions:")
print(f"   Test sentences:       {len(test_data):,}")
print(f"   Predicted entities:   {len(test_entities):,}")
print(f"   Output file:          {output_file}")
print(f"   Model saved:          {model_save_path}")

print(f"\n‚úÖ Implementation Status:")
print(f"   ‚úì BERT fine-tuned for NER")
print(f"   ‚úì Handled subword tokenization alignment")
print(f"   ‚úì Evaluated with entity-span level metrics")
print(f"   ‚úì Generated test predictions")
print(f"   ‚úì All BIO sequences are valid")

# Performance analysis
expected_range = (0.90, 0.95)
actual_f1 = bert_results['f1']

print(f"\nüéØ Performance Analysis:")
print(f"   Expected F1 range:  {expected_range[0]:.2f} - {expected_range[1]:.2f}")
print(f"   Actual F1 score:    {actual_f1:.4f}")

if actual_f1 >= expected_range[0]:
    if actual_f1 <= expected_range[1]:
        print(f"   ‚úÖ Performance meets expectations!")
    else:
        print(f"   üöÄ Performance exceeds expectations!")
else:
    print(f"   ‚ö†Ô∏è  Performance below expected range")

print(f"\nüí° Key Strengths:")
print(f"   ‚Ä¢ State-of-the-art contextual embeddings")
print(f"   ‚Ä¢ Transfer learning from massive pre-training")
print(f"   ‚Ä¢ Bidirectional attention mechanism")
print(f"   ‚Ä¢ Handles unknown words via subword tokenization")
print(f"   ‚Ä¢ Strong baseline with minimal tuning")

print(f"\n‚ö†Ô∏è  Limitations:")
print(f"   ‚Ä¢ May produce invalid BIO sequences (no CRF)")
print(f"   ‚Ä¢ Longer training time compared to classical models")
print(f"   ‚Ä¢ Requires GPU for efficient training")
print(f"   ‚Ä¢ Large memory footprint for 110M parameters")

print(f"\nüîú Possible Improvements:")
print(f"   ‚Ä¢ Add CRF layer for valid BIO sequences")
print(f"   ‚Ä¢ Use larger BERT model (bert-large-cased)")
print(f"   ‚Ä¢ Implement learning rate scheduling")
print(f"   ‚Ä¢ Try different max sequence lengths")
print(f"   ‚Ä¢ Use weighted loss for class imbalance")

print("\n" + "=" * 80)
print("BERT FINE-TUNED MODEL COMPLETE!")
print("=" * 80)

BERT FINE-TUNED MODEL - SUMMARY

üìä Model Performance:


NameError: name 'bert_results' is not defined