# Detailed Validation Set Comparison

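Compares entity-span-level precision, recall, and F1 on the validation split, overall and per entity type, for three systems:
- Your model (KB-NER): knowledge-augmented XLM-RoBERTa + CRF
- Friend's model: RoBERTa-based NER
- Hybrid Ensemble of the two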

In [13]:
# json parses the JSONL ground-truth and prediction files loaded below.
import json
# NOTE(review): defaultdict is not referenced in any cell visible in this notebook.
from collections import defaultdict
# Project-local helper module; the cells below call utils.print_evaluation_report,
# utils.get_entity_spans and utils.evaluate_entity_spans.
import utils

print("‚úì Imports loaded")

‚úì Imports loaded


## Load Data

In [14]:
# Load ground truth: one JSON object per line with 'id', 'ner_tags' and 'tokens'.
ground_truth = {}   # example id -> gold tag sequence
tokens_dict = {}    # example id -> token sequence

# Pass the encoding explicitly so decoding does not depend on the platform's
# default locale encoding (JSON Lines files are UTF-8).
with open('val_split.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        ex = json.loads(line)
        ground_truth[ex['id']] = ex['ner_tags']
        tokens_dict[ex['id']] = ex['tokens']

print(f"Loaded {len(ground_truth)} ground truth examples")

Loaded 10036 ground truth examples


In [15]:
def load_tag_predictions(path):
    """Read a JSONL predictions file into an ``{id: ner_tags}`` dict.

    Each non-blank line must be a JSON object with ``'id'`` and ``'ner_tags'``
    keys. If an id occurs more than once, the last occurrence wins.

    Args:
        path: Path of the JSONL file to read.

    Returns:
        dict mapping each example id to its predicted tag sequence.
    """
    predictions = {}
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            ex = json.loads(line)
            predictions[ex['id']] = ex['ner_tags']
    return predictions


# Load predictions from all three models
your_predictions = load_tag_predictions('val_predictions_xlmr_crf.jsonl')
friend_predictions = load_tag_predictions('val_data_roberta_predictions.jsonl')
ensemble_predictions = load_tag_predictions('val_predictions_ensemble_improved.jsonl')

print("Loaded predictions from all models")

Loaded predictions from all models


In [16]:
# Get aligned IDs: keep only the example ids present in the ground truth AND in
# every model's predictions, so all models are scored on the same examples.
id_sources = {
    'ground truth': ground_truth,
    'your model': your_predictions,
    'friend model': friend_predictions,
    'ensemble': ensemble_predictions,
}
aligned_ids = sorted(set.intersection(*(set(source) for source in id_sources.values())))

print(f"Aligned examples: {len(aligned_ids)}")

# Make dropped examples visible: an id missing from any one source is excluded
# from every model's evaluation, which would otherwise happen silently.
for source_name, source in id_sources.items():
    n_excluded = len(source) - len(aligned_ids)
    if n_excluded:
        print(f"  WARNING: {n_excluded} ids from {source_name} are excluded (missing from another source)")

Aligned examples: 10036


## Prepare Data for Evaluation

In [17]:
# Build parallel lists, all ordered by aligned_ids, so that position k in every
# list refers to the same example.
gt_tags_list = [ground_truth[example_id] for example_id in aligned_ids]
tokens_list = [tokens_dict[example_id] for example_id in aligned_ids]

# One prediction list per system, in the same order as the gold lists.
your_pred_list = [your_predictions[example_id] for example_id in aligned_ids]
friend_pred_list = [friend_predictions[example_id] for example_id in aligned_ids]
ensemble_pred_list = [ensemble_predictions[example_id] for example_id in aligned_ids]

print("‚úì Prepared data for evaluation")

‚úì Prepared data for evaluation


## Your Model (KB-NER) - Detailed Report

In [18]:
# Banner: blank line, rule, title, rule (one item per output line).
print(
    "\n" + "=" * 80,
    "YOUR MODEL: Knowledge-Augmented XLM-RoBERTa--CRF",
    "=" * 80,
    sep="\n",
)

# Report printing is delegated to the project-local utils module.
utils.print_evaluation_report(
    gt_tags_list,
    your_pred_list,
    tokens_list,
    "Your Model (KB-NER)",
)


YOUR MODEL: Knowledge-Augmented XLM-RoBERTa--CRF
ENTITY-SPAN LEVEL EVALUATION REPORT: Your Model (KB-NER)

OVERALL METRICS:
  Precision: 0.8249
  Recall:    0.8275
  F1 Score:  0.8262

  True Positives:  11151
  False Positives: 2367
  False Negatives: 2324

--------------------------------------------------------------------------------
PER-ENTITY-TYPE METRICS:
--------------------------------------------------------------------------------
Entity Type          Precision    Recall       F1           Support   
--------------------------------------------------------------------------------
Artist               0.8154       0.8420       0.8285       2849      
Facility             0.8428       0.8440       0.8434       1487      
HumanSettlement      0.9575       0.9586       0.9580       3476      
ORG                  0.8176       0.8193       0.8185       1893      
OtherPER             0.6540       0.6863       0.6698       1779      
Politician           0.7516       0.6690      

## Friend's Model (RoBERTa) - Detailed Report

In [19]:
# Banner: blank line, rule, title, rule (one item per output line).
print(
    "\n" + "=" * 80,
    "FRIEND'S MODEL: RoBERTa-based NER",
    "=" * 80,
    sep="\n",
)

# Report printing is delegated to the project-local utils module.
utils.print_evaluation_report(
    gt_tags_list,
    friend_pred_list,
    tokens_list,
    "Friend's Model (RoBERTa)",
)


FRIEND'S MODEL: RoBERTa-based NER
ENTITY-SPAN LEVEL EVALUATION REPORT: Friend's Model (RoBERTa)

OVERALL METRICS:
  Precision: 0.7984
  Recall:    0.8056
  F1 Score:  0.8020

  True Positives:  10856
  False Positives: 2742
  False Negatives: 2619

--------------------------------------------------------------------------------
PER-ENTITY-TYPE METRICS:
--------------------------------------------------------------------------------
Entity Type          Precision    Recall       F1           Support   
--------------------------------------------------------------------------------
Artist               0.8009       0.8287       0.8146       2849      
Facility             0.8121       0.7848       0.7982       1487      
HumanSettlement      0.9297       0.9436       0.9366       3476      
ORG                  0.7499       0.7950       0.7718       1893      
OtherPER             0.6446       0.6565       0.6505       1779      
Politician           0.7218       0.6755       0.6979   

## Hybrid Ensemble - Detailed Report

In [20]:
# Banner: blank line, rule, title, rule (one item per output line).
print(
    "\n" + "=" * 80,
    "HYBRID ENSEMBLE",
    "=" * 80,
    sep="\n",
)

# Report printing is delegated to the project-local utils module.
utils.print_evaluation_report(
    gt_tags_list,
    ensemble_pred_list,
    tokens_list,
    "Hybrid Ensemble",
)


HYBRID ENSEMBLE
ENTITY-SPAN LEVEL EVALUATION REPORT: Hybrid Ensemble

OVERALL METRICS:
  Precision: 0.8242
  Recall:    0.8278
  F1 Score:  0.8260

  True Positives:  11155
  False Positives: 2379
  False Negatives: 2320

--------------------------------------------------------------------------------
PER-ENTITY-TYPE METRICS:
--------------------------------------------------------------------------------
Entity Type          Precision    Recall       F1           Support   
--------------------------------------------------------------------------------
Artist               0.8154       0.8417       0.8283       2849      
Facility             0.8434       0.8440       0.8437       1487      
HumanSettlement      0.9577       0.9583       0.9580       3476      
ORG                  0.8190       0.8151       0.8171       1893      
OtherPER             0.6540       0.6863       0.6698       1779      
Politician           0.7522       0.6690       0.7082       1402      
PublicCorp  

## Side-by-Side Comparison

In [21]:
# Collect the entity types that occur in the ground truth. Only 'B-' tags are
# inspected; the type name is whatever follows the two-character prefix.
entity_types = sorted({
    tag[2:]
    for tags in ground_truth.values()
    for tag in tags
    if tag.startswith('B-')
})

# Calculate per-entity-type metrics for all models
def calculate_per_type_metrics(gt_tags_list, pred_tags_list, tokens_list, entity_types):
    """Compute exact-match span precision, recall and F1 for each entity type.

    Spans come from ``utils.get_entity_spans(tokens, tags)`` and are unpacked as
    3-item ``(s, e, t)`` records; the last item is matched against the entity
    type. A predicted span counts as a true positive only when an identical
    ``(s, e, t)`` triple is among the gold spans of the same example.

    Each example's spans are extracted once and the counts are bucketed by
    type, rather than re-extracting spans for every entity type.

    Args:
        gt_tags_list: Gold tag sequences, one per example.
        pred_tags_list: Predicted tag sequences, parallel to ``gt_tags_list``.
        tokens_list: Token sequences, parallel to ``gt_tags_list``.
        entity_types: Entity type names to report on.

    Returns:
        dict mapping each entity type to a dict with keys ``'precision'``,
        ``'recall'``, ``'f1'`` (each 0.0 when its denominator is zero) and
        ``'support'`` (number of gold spans of that type, i.e. TP + FN).
    """
    tp_by_type = {}
    fp_by_type = {}
    fn_by_type = {}

    for gt_tags, pred_tags, tokens in zip(gt_tags_list, pred_tags_list, tokens_list):
        # Sets, so a span repeated by the extractor is counted once.
        gt_spans = {(s, e, t) for s, e, t in utils.get_entity_spans(tokens, gt_tags)}
        pred_spans = {(s, e, t) for s, e, t in utils.get_entity_spans(tokens, pred_tags)}

        # The type is part of the span triple, so bucketing the overall
        # intersection/differences by type equals filtering by type first.
        for _, _, t in gt_spans & pred_spans:
            tp_by_type[t] = tp_by_type.get(t, 0) + 1
        for _, _, t in pred_spans - gt_spans:
            fp_by_type[t] = fp_by_type.get(t, 0) + 1
        for _, _, t in gt_spans - pred_spans:
            fn_by_type[t] = fn_by_type.get(t, 0) + 1

    type_metrics = {}

    for entity_type in entity_types:
        tp = tp_by_type.get(entity_type, 0)
        fp = fp_by_type.get(entity_type, 0)
        fn = fn_by_type.get(entity_type, 0)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        type_metrics[entity_type] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': tp + fn
        }

    return type_metrics

# Score each system against the same gold spans, in the order: yours, friend's, ensemble.
your_type_metrics, friend_type_metrics, ensemble_type_metrics = (
    calculate_per_type_metrics(gt_tags_list, model_preds, tokens_list, entity_types)
    for model_preds in (your_pred_list, friend_pred_list, ensemble_pred_list)
)

print("‚úì Calculated per-type metrics for all models")

‚úì Calculated per-type metrics for all models


In [22]:
# Display side-by-side comparison of per-type F1 for the three systems.
print("\n" + "="*120)
print("SIDE-BY-SIDE COMPARISON: F1 SCORES PER ENTITY TYPE")
print("="*120)
print(f"{'Entity Type':<20} {'Your Model':>15} {'Friend Model':>15} {'Hybrid Ensemble':>18} {'Best':>12} {'Improvement':>15}")
print("-"*120)

for entity_type in entity_types:
    your_f1 = your_type_metrics[entity_type]['f1']
    friend_f1 = friend_type_metrics[entity_type]['f1']
    ensemble_f1 = ensemble_type_metrics[entity_type]['f1']
    # Support is the gold-span count, so it is the same for every model.
    support = your_type_metrics[entity_type]['support']

    # Improvement = ensemble F1 minus the stronger of the two individual models.
    best_individual_f1 = max(your_f1, friend_f1)
    improvement = ensemble_f1 - best_individual_f1

    # Determine overall best (a tie goes to the ensemble first, then to "Your").
    best_overall = max(your_f1, friend_f1, ensemble_f1)
    if best_overall == ensemble_f1:
        best_marker = 'üèÜ Ensemble'
    elif best_overall == your_f1:
        best_marker = 'Your'
    else:
        best_marker = 'Friend'

    # Compare at the displayed precision: a difference that rounds to zero at
    # four decimals would otherwise be printed as a signed zero ("-0.0000").
    improvement_str = f"{improvement:+.4f}" if round(improvement, 4) != 0 else "0.0000"

    print(f"{entity_type:<20} {your_f1:>15.4f} {friend_f1:>15.4f} {ensemble_f1:>18.4f} {best_marker:>12} {improvement_str:>15}")
    print(f"{'':20} {'':15} {'':15} {'':18} {'':12} (n={support})")

print("="*120)


SIDE-BY-SIDE COMPARISON: F1 SCORES PER ENTITY TYPE
Entity Type               Your Model    Friend Model    Hybrid Ensemble         Best     Improvement
------------------------------------------------------------------------------------------------------------------------
Artist                        0.8285          0.8146             0.8283         Your         -0.0002
                                                                                     (n=2849)
Facility                      0.8434          0.7982             0.8437   üèÜ Ensemble         +0.0003
                                                                                     (n=1487)
HumanSettlement               0.9580          0.9366             0.9580         Your         -0.0000
                                                                                     (n=3476)
ORG                           0.8185          0.7718             0.8171         Your         -0.0014
                                     

## Overall Comparison Summary

In [23]:
# Overall span-level metrics per system; each result is indexed below by its
# 'precision', 'recall' and 'f1' keys.
your_overall = utils.evaluate_entity_spans(gt_tags_list, your_pred_list, tokens_list)
friend_overall = utils.evaluate_entity_spans(gt_tags_list, friend_pred_list, tokens_list)
ensemble_overall = utils.evaluate_entity_spans(gt_tags_list, ensemble_pred_list, tokens_list)

print("\n" + "=" * 100)
print("OVERALL METRICS COMPARISON")
print("=" * 100)
print(f"{'Model':<35} {'Precision':>15} {'Recall':>15} {'F1':>15}")
print("-" * 100)

# Leaderboard ordered by F1, best first (stable for ties).
all_models = sorted(
    [
        ('Your Model (KB-NER)', your_overall),
        ('Friend Model (RoBERTa)', friend_overall),
        ('Hybrid Ensemble', ensemble_overall),
    ],
    key=lambda entry: entry[1]['f1'],
    reverse=True,
)

for rank, (name, metrics) in enumerate(all_models, start=1):
    # The top row gets the trophy marker; the others get their rank number.
    marker = f"{rank}." if rank != 1 else "üèÜ"
    print(f"{marker} {name:<32} {metrics['precision']:>15.4f} {metrics['recall']:>15.4f} {metrics['f1']:>15.4f}")

print("=" * 100)

# Ensemble F1 relative to the stronger of the two individual models.
best_individual_f1 = max(your_overall['f1'], friend_overall['f1'])
ensemble_f1 = ensemble_overall['f1']
improvement = ensemble_f1 - best_individual_f1

print("\nEnsemble vs Best Individual Model:")
print(f"  Improvement: {improvement:+.4f} ({improvement*100:+.2f}%)")

# The verdict uses the exact sign of the difference (no tolerance band).
if improvement > 0:
    verdict = "  ‚úÖ Ensemble outperforms both individual models"
elif improvement < 0:
    verdict = "  ‚ö†Ô∏è  Best individual model outperforms ensemble"
else:
    verdict = "  ü§ù Ensemble performs similarly to best individual model"
print(verdict)


OVERALL METRICS COMPARISON
Model                                     Precision          Recall              F1
----------------------------------------------------------------------------------------------------
üèÜ Your Model (KB-NER)                       0.8249          0.8275          0.8262
2. Hybrid Ensemble                           0.8242          0.8278          0.8260
3. Friend Model (RoBERTa)                    0.7984          0.8056          0.8020

Ensemble vs Best Individual Model:
  Improvement: -0.0002 (-0.02%)
  ‚ö†Ô∏è  Best individual model outperforms ensemble


## Entity Types Where Ensemble Helps Most

In [24]:
def _ensemble_gain(entity_type):
    """Return (type, ensemble F1 - best individual F1, ensemble F1, best individual F1)."""
    best_single = max(
        your_type_metrics[entity_type]['f1'],
        friend_type_metrics[entity_type]['f1'],
    )
    combined = ensemble_type_metrics[entity_type]['f1']
    return (entity_type, combined - best_single, combined, best_single)


# One row per entity type, ordered by improvement with the largest gain first.
improvements = sorted(
    map(_ensemble_gain, entity_types),
    key=lambda row: row[1],
    reverse=True,
)

print("\n" + "=" * 80)
print("ENTITY TYPES WHERE ENSEMBLE HELPS MOST")
print("=" * 80)
print(f"{'Entity Type':<20} {'Improvement':>15} {'Best Individual':>18} {'Ensemble':>15}")
print("-" * 80)

for entity_type, improvement, ensemble_f1, best_individual_f1 in improvements:
    # Changes within +/-0.001 F1 are shown with the neutral marker.
    if improvement > 0.001:
        marker = "‚úÖ"
    elif improvement < -0.001:
        marker = "‚ö†Ô∏è"
    else:
        marker = "‚ûñ"
    print(f"{marker} {entity_type:<18} {improvement:>15.4f} {best_individual_f1:>18.4f} {ensemble_f1:>15.4f}")

print("=" * 80)


ENTITY TYPES WHERE ENSEMBLE HELPS MOST
Entity Type              Improvement    Best Individual        Ensemble
--------------------------------------------------------------------------------
‚úÖ PublicCorp                  0.0015             0.7686          0.7701
‚ûñ Facility                    0.0003             0.8434          0.8437
‚ûñ Politician                  0.0003             0.7079          0.7082
‚ûñ OtherPER                    0.0000             0.6698          0.6698
‚ûñ HumanSettlement            -0.0000             0.9580          0.9580
‚ûñ Artist                     -0.0002             0.8285          0.8283
‚ö†Ô∏è ORG                        -0.0014             0.8185          0.8171
