# Improved Hybrid Ensemble - Validation Set

This notebook:
1. Loads validation predictions from both models (yours and friend's)
2. Implements improved ensemble strategies (union, intersection, voting, hybrid)
3. Evaluates each strategy on validation set
4. Compares ensemble performance vs individual models
5. Finds the best strategy per entity type

In [36]:
import json
from collections import defaultdict
from typing import List, Set, Tuple
import utils

print("‚úì Imports loaded")

‚úì Imports loaded


## Load Validation Ground Truth and Predictions

In [37]:
# Load ground truth from validation set
ground_truth = {}
tokens_dict = {}

with open('val_split.jsonl', 'r') as f:
    for line in f:
        ex = json.loads(line)
        ground_truth[ex['id']] = ex['ner_tags']
        tokens_dict[ex['id']] = ex['tokens']

print(f"Loaded {len(ground_truth)} ground truth examples")

Loaded 10036 ground truth examples


In [38]:
# Load your model predictions (KB-NER)
your_predictions = {}
with open('val_predictions_xlmr_crf.jsonl', 'r') as f:
    for line in f:
        ex = json.loads(line)
        your_predictions[ex['id']] = ex['ner_tags']

print(f"Loaded {len(your_predictions)} predictions from your model (KB-NER)")

# Load friend's model predictions (RoBERTa)
friend_predictions = {}
with open('val_data_roberta_predictions.jsonl', 'r') as f:
    for line in f:
        ex = json.loads(line)
        friend_predictions[ex['id']] = ex['ner_tags']

print(f"Loaded {len(friend_predictions)} predictions from friend's model (RoBERTa)")

Loaded 10036 predictions from your model (KB-NER)
Loaded 10036 predictions from friend's model (RoBERTa)


In [39]:
# Get aligned IDs (examples present in all three)
aligned_ids = sorted(set(ground_truth.keys()) & set(your_predictions.keys()) & set(friend_predictions.keys()))

print(f"\nAligned examples across all datasets: {len(aligned_ids)}")


Aligned examples across all datasets: 10036


## Evaluate Individual Models on Validation Set

In [40]:
# Evaluate your model
your_gt = [ground_truth[id] for id in aligned_ids]
your_pred = [your_predictions[id] for id in aligned_ids]
your_tokens = [tokens_dict[id] for id in aligned_ids]

your_results = utils.evaluate_entity_spans(your_gt, your_pred, your_tokens)

print("Your Model (KB-NER) on Validation:")
print(f"  Precision: {your_results['precision']:.4f}")
print(f"  Recall:    {your_results['recall']:.4f}")
print(f"  F1:        {your_results['f1']:.4f}")

Your Model (KB-NER) on Validation:
  Precision: 0.8249
  Recall:    0.8275
  F1:        0.8262


In [41]:
# Evaluate friend's model
friend_gt = [ground_truth[id] for id in aligned_ids]
friend_pred = [friend_predictions[id] for id in aligned_ids]
friend_tokens = [tokens_dict[id] for id in aligned_ids]

friend_results = utils.evaluate_entity_spans(friend_gt, friend_pred, friend_tokens)

print("\nFriend's Model (RoBERTa) on Validation:")
print(f"  Precision: {friend_results['precision']:.4f}")
print(f"  Recall:    {friend_results['recall']:.4f}")
print(f"  F1:        {friend_results['f1']:.4f}")


Friend's Model (RoBERTa) on Validation:
  Precision: 0.7984
  Recall:    0.8056
  F1:        0.8020


## Get Entity Types

In [42]:
# Extract all entity types from ground truth
entity_types = set()
for tags in ground_truth.values():
    for tag in tags:
        if tag != 'O' and tag.startswith('B-'):
            entity_types.add(tag[2:])

entity_types = sorted(entity_types)
print(f"Entity types: {entity_types}")
print(f"Total: {len(entity_types)} types")

Entity types: ['Artist', 'Facility', 'HumanSettlement', 'ORG', 'OtherPER', 'Politician', 'PublicCorp']
Total: 7 types


## Implement Ensemble Strategies

In [43]:
def union_ensemble(your_tags, friend_tags, tokens):
    """Take ALL entities from both models - maximizes recall"""
    your_entities = utils.extract_entities(tokens, your_tags)
    friend_entities = utils.extract_entities(tokens, friend_tags)
    
    # Combine all entities (using set to avoid duplicates)
    all_entities = set()
    for entity in your_entities:
        # entity = (text, type, start, end)
        all_entities.add((entity[2], entity[3], entity[1]))  # (start, end, type)
    for entity in friend_entities:
        all_entities.add((entity[2], entity[3], entity[1]))
    
    # Convert back to BIO tags
    ensemble_tags = ['O'] * len(tokens)
    for start, end, entity_type in all_entities:
        ensemble_tags[start] = f'B-{entity_type}'
        for i in range(start + 1, end + 1):
            ensemble_tags[i] = f'I-{entity_type}'
    
    return ensemble_tags


def intersection_ensemble(your_tags, friend_tags, tokens):
    """Only take entities BOTH models agree on - maximizes precision"""
    your_entities = utils.extract_entities(tokens, your_tags)
    friend_entities = utils.extract_entities(tokens, friend_tags)
    
    your_set = set((e[2], e[3], e[1]) for e in your_entities)
    friend_set = set((e[2], e[3], e[1]) for e in friend_entities)
    
    # Only keep entities both agree on
    agreed_entities = your_set & friend_set
    
    # Convert back to BIO tags
    ensemble_tags = ['O'] * len(tokens)
    for start, end, entity_type in agreed_entities:
        ensemble_tags[start] = f'B-{entity_type}'
        for i in range(start + 1, end + 1):
            ensemble_tags[i] = f'I-{entity_type}'
    
    return ensemble_tags


def voting_ensemble(your_tags, friend_tags, tokens):
    """Token-level majority voting"""
    ensemble_tags = []
    
    for y_tag, f_tag in zip(your_tags, friend_tags):
        if y_tag == f_tag:
            ensemble_tags.append(y_tag)
        elif y_tag == 'O':
            ensemble_tags.append(f_tag)
        elif f_tag == 'O':
            ensemble_tags.append(y_tag)
        else:
            # Both predict entity but disagree on type - take your model's prediction
            ensemble_tags.append(y_tag)
    
    return ensemble_tags


def hybrid_ensemble(your_tags, friend_tags, tokens, strategy_map):
    """
    Use different ensemble strategy for each entity type.
    strategy_map: dict mapping entity_type -> strategy ('union', 'intersection', 'your', 'friend')
    """
    your_entities = utils.extract_entities(tokens, your_tags)
    friend_entities = utils.extract_entities(tokens, friend_tags)
    
    selected_entities = set()
    
    # Process each entity type with its best strategy
    for entity_type in entity_types:
        your_type_entities = set((e[2], e[3], e[1]) for e in your_entities if e[1] == entity_type)
        friend_type_entities = set((e[2], e[3], e[1]) for e in friend_entities if e[1] == entity_type)
        
        strategy = strategy_map.get(entity_type, 'union')
        
        if strategy == 'union':
            selected_entities.update(your_type_entities | friend_type_entities)
        elif strategy == 'intersection':
            selected_entities.update(your_type_entities & friend_type_entities)
        elif strategy == 'your':
            selected_entities.update(your_type_entities)
        elif strategy == 'friend':
            selected_entities.update(friend_type_entities)
    
    # Convert back to BIO tags
    ensemble_tags = ['O'] * len(tokens)
    for start, end, entity_type in selected_entities:
        ensemble_tags[start] = f'B-{entity_type}'
        for i in range(start + 1, end + 1):
            ensemble_tags[i] = f'I-{entity_type}'
    
    return ensemble_tags

print("‚úì Ensemble strategies defined")

‚úì Ensemble strategies defined


## Test All Strategies

In [44]:
# Test union strategy
print("Testing UNION ensemble...")
union_predictions = []
for id in aligned_ids:
    ensemble_tags = union_ensemble(
        your_predictions[id],
        friend_predictions[id],
        tokens_dict[id]
    )
    union_predictions.append(ensemble_tags)

union_results = utils.evaluate_entity_spans(your_gt, union_predictions, your_tokens)
print(f"  F1: {union_results['f1']:.4f}")

Testing UNION ensemble...
  F1: 0.8074


In [45]:
# Test intersection strategy
print("\nTesting INTERSECTION ensemble...")
intersection_predictions = []
for id in aligned_ids:
    ensemble_tags = intersection_ensemble(
        your_predictions[id],
        friend_predictions[id],
        tokens_dict[id]
    )
    intersection_predictions.append(ensemble_tags)

intersection_results = utils.evaluate_entity_spans(your_gt, intersection_predictions, your_tokens)
print(f"  F1: {intersection_results['f1']:.4f}")


Testing INTERSECTION ensemble...
  F1: 0.8172


In [46]:
# Test voting strategy
print("\nTesting VOTING ensemble...")
voting_predictions = []
for id in aligned_ids:
    ensemble_tags = voting_ensemble(
        your_predictions[id],
        friend_predictions[id],
        tokens_dict[id]
    )
    voting_predictions.append(ensemble_tags)

voting_results = utils.evaluate_entity_spans(your_gt, voting_predictions, your_tokens)
print(f"  F1: {voting_results['f1']:.4f}")


Testing VOTING ensemble...
  F1: 0.8150


## Find Best Strategy Per Entity Type

In [47]:
print("\nFinding best strategy for each entity type...\n")

best_strategy_per_type = {}

for entity_type in entity_types:
    print(f"Testing strategies for {entity_type}...")
    
    type_results = {}
    
    # Test each strategy for this entity type
    for strategy_name in ['union', 'intersection', 'your', 'friend']:
        # Create temporary strategy map with all types using this strategy
        temp_strategy_map = {entity_type: strategy_name}
        # For other types, use union as default
        for other_type in entity_types:
            if other_type != entity_type:
                temp_strategy_map[other_type] = 'union'
        
        # Generate predictions
        temp_predictions = []
        for id in aligned_ids:
            ensemble_tags = hybrid_ensemble(
                your_predictions[id],
                friend_predictions[id],
                tokens_dict[id],
                temp_strategy_map
            )
            temp_predictions.append(ensemble_tags)
        
        # Calculate F1 for this entity type only
        tp = fp = fn = 0
        for gt_tags, pred_tags, tokens in zip(your_gt, temp_predictions, your_tokens):
            gt_spans = utils.get_entity_spans(tokens, gt_tags)
            pred_spans = utils.get_entity_spans(tokens, pred_tags)
            
            # Filter for current entity type
            gt_type_spans = {(s, e, t) for s, e, t in gt_spans if t == entity_type}
            pred_type_spans = {(s, e, t) for s, e, t in pred_spans if t == entity_type}
            
            tp += len(gt_type_spans & pred_type_spans)
            fp += len(pred_type_spans - gt_type_spans)
            fn += len(gt_type_spans - pred_type_spans)
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
        type_results[strategy_name] = f1
        print(f"  {strategy_name:12} -> F1: {f1:.4f}")
    
    # Pick best strategy for this type
    best = max(type_results.items(), key=lambda x: x[1])
    best_strategy_per_type[entity_type] = best[0]
    print(f"  ‚úì Best: {best[0]} (F1: {best[1]:.4f})\n")

print("\n" + "="*60)
print("BEST STRATEGY PER ENTITY TYPE")
print("="*60)
for entity_type, strategy in sorted(best_strategy_per_type.items()):
    print(f"  {entity_type:20} -> {strategy}")
print("="*60)


Finding best strategy for each entity type...

Testing strategies for Artist...
  union        -> F1: 0.8201
  intersection -> F1: 0.8214
  your         -> F1: 0.8268
  friend       -> F1: 0.8166
  ‚úì Best: your (F1: 0.8268)

Testing strategies for Facility...
  union        -> F1: 0.8140
  intersection -> F1: 0.8245
  your         -> F1: 0.8440
  friend       -> F1: 0.8003
  ‚úì Best: your (F1: 0.8440)

Testing strategies for HumanSettlement...
  union        -> F1: 0.9399
  intersection -> F1: 0.9505
  your         -> F1: 0.9539
  friend       -> F1: 0.9381
  ‚úì Best: your (F1: 0.9539)

Testing strategies for ORG...
  union        -> F1: 0.7824
  intersection -> F1: 0.8096
  your         -> F1: 0.8166
  friend       -> F1: 0.7763
  ‚úì Best: your (F1: 0.8166)

Testing strategies for OtherPER...
  union        -> F1: 0.6413
  intersection -> F1: 0.6439
  your         -> F1: 0.6500
  friend       -> F1: 0.6349
  ‚úì Best: your (F1: 0.6500)

Testing strategies for Politician...
  uni

## Generate Hybrid Ensemble Predictions

In [48]:
print("\nGenerating HYBRID ensemble predictions...")

hybrid_predictions = []
for id in aligned_ids:
    ensemble_tags = hybrid_ensemble(
        your_predictions[id],
        friend_predictions[id],
        tokens_dict[id],
        best_strategy_per_type
    )
    hybrid_predictions.append(ensemble_tags)

hybrid_results = utils.evaluate_entity_spans(your_gt, hybrid_predictions, your_tokens)
print(f"  F1: {hybrid_results['f1']:.4f}")


Generating HYBRID ensemble predictions...
  F1: 0.8260


## Compare All Approaches

In [49]:
print("\n" + "="*80)
print("VALIDATION SET PERFORMANCE COMPARISON")
print("="*80)
print(f"{'Method':<30} {'Precision':>12} {'Recall':>12} {'F1':>12}")
print("-"*80)

all_results = [
    ('Your Model (KB-NER)', your_results),
    ('Friend Model (RoBERTa)', friend_results),
    ('Union Ensemble', union_results),
    ('Intersection Ensemble', intersection_results),
    ('Voting Ensemble', voting_results),
    ('Hybrid Ensemble', hybrid_results)
]

# Sort by F1
all_results.sort(key=lambda x: x[1]['f1'], reverse=True)

for i, (name, results) in enumerate(all_results, 1):
    marker = "üèÜ" if i == 1 else f"{i}."
    print(f"{marker} {name:<27} {results['precision']:>12.4f} {results['recall']:>12.4f} {results['f1']:>12.4f}")

print("="*80)


VALIDATION SET PERFORMANCE COMPARISON
Method                            Precision       Recall           F1
--------------------------------------------------------------------------------
üèÜ Your Model (KB-NER)               0.8249       0.8275       0.8262
2. Hybrid Ensemble                   0.8242       0.8278       0.8260
3. Intersection Ensemble             0.8990       0.7490       0.8172
4. Voting Ensemble                   0.8024       0.8281       0.8150
5. Union Ensemble                    0.7939       0.8214       0.8074
6. Friend Model (RoBERTa)            0.7984       0.8056       0.8020


## Save Best Ensemble Predictions

In [50]:
# Determine which ensemble performed best
ensemble_scores = {
    'union': union_results['f1'],
    'intersection': intersection_results['f1'],
    'voting': voting_results['f1'],
    'hybrid': hybrid_results['f1']
}

best_ensemble = max(ensemble_scores.items(), key=lambda x: x[1])

print(f"\nüèÜ Best Ensemble Strategy: {best_ensemble[0].upper()}")
print(f"   F1 Score: {best_ensemble[1]:.4f}")

# Save the best ensemble predictions
if best_ensemble[0] == 'union':
    best_predictions = union_predictions
elif best_ensemble[0] == 'intersection':
    best_predictions = intersection_predictions
elif best_ensemble[0] == 'voting':
    best_predictions = voting_predictions
else:
    best_predictions = hybrid_predictions

output_file = 'val_predictions_ensemble_improved.jsonl'
with open(output_file, 'w') as f:
    for id, pred_tags in zip(aligned_ids, best_predictions):
        output = {
            'id': id,
            'tokens': tokens_dict[id],
            'ner_tags': pred_tags
        }
        f.write(json.dumps(output) + '\n')

print(f"\n‚úì Saved best ensemble predictions to {output_file}")


üèÜ Best Ensemble Strategy: HYBRID
   F1 Score: 0.8260

‚úì Saved best ensemble predictions to val_predictions_ensemble_improved.jsonl


## Save Strategy Configuration

In [51]:
# Save the strategy configuration for future reference
strategy_config = {
    'best_overall_ensemble': best_ensemble[0],
    'best_strategy_per_type': best_strategy_per_type,
    'results': {
        'your_model': {
            'precision': your_results['precision'],
            'recall': your_results['recall'],
            'f1': your_results['f1']
        },
        'friend_model': {
            'precision': friend_results['precision'],
            'recall': friend_results['recall'],
            'f1': friend_results['f1']
        },
        'union': {
            'precision': union_results['precision'],
            'recall': union_results['recall'],
            'f1': union_results['f1']
        },
        'intersection': {
            'precision': intersection_results['precision'],
            'recall': intersection_results['recall'],
            'f1': intersection_results['f1']
        },
        'voting': {
            'precision': voting_results['precision'],
            'recall': voting_results['recall'],
            'f1': voting_results['f1']
        },
        'hybrid': {
            'precision': hybrid_results['precision'],
            'recall': hybrid_results['recall'],
            'f1': hybrid_results['f1']
        }
    }
}

with open('val_ensemble_strategy.json', 'w') as f:
    json.dump(strategy_config, f, indent=2)

print("\n‚úì Saved strategy configuration to val_ensemble_strategy.json")


‚úì Saved strategy configuration to val_ensemble_strategy.json


## Recommendations

In [52]:
print("\n" + "="*80)
print("RECOMMENDATIONS")
print("="*80)

# Compare best ensemble vs best individual
best_individual_f1 = max(your_results['f1'], friend_results['f1'])
best_individual_name = 'Your Model (KB-NER)' if your_results['f1'] > friend_results['f1'] else 'Friend Model (RoBERTa)'

best_ensemble_f1 = best_ensemble[1]

improvement = best_ensemble_f1 - best_individual_f1

if improvement > 0.001:  # 0.1% improvement
    print(f"\n‚úÖ USE ENSEMBLE APPROACH")
    print(f"   Best Ensemble: {best_ensemble[0].upper()} (F1: {best_ensemble_f1:.4f})")
    print(f"   Best Individual: {best_individual_name} (F1: {best_individual_f1:.4f})")
    print(f"   Improvement: +{improvement:.4f} ({improvement*100:.2f}%)")
    print(f"\n   üìÅ Use predictions from: val_predictions_ensemble_improved.jsonl")
elif improvement < -0.001:
    print(f"\n‚ö†Ô∏è  USE INDIVIDUAL MODEL")
    print(f"   Best Individual: {best_individual_name} (F1: {best_individual_f1:.4f})")
    print(f"   Best Ensemble: {best_ensemble[0].upper()} (F1: {best_ensemble_f1:.4f})")
    print(f"   Ensemble performs worse by: {abs(improvement):.4f} ({abs(improvement)*100:.2f}%)")
else:
    print(f"\nü§ù ENSEMBLE AND INDIVIDUAL PERFORM SIMILARLY")
    print(f"   Best Individual: {best_individual_name} (F1: {best_individual_f1:.4f})")
    print(f"   Best Ensemble: {best_ensemble[0].upper()} (F1: {best_ensemble_f1:.4f})")
    print(f"   Difference: {improvement:.4f}")
    print(f"   Either approach is viable.")

print("\n" + "="*80)


RECOMMENDATIONS

ü§ù ENSEMBLE AND INDIVIDUAL PERFORM SIMILARLY
   Best Individual: Your Model (KB-NER) (F1: 0.8262)
   Best Ensemble: HYBRID (F1: 0.8260)
   Difference: -0.0002
   Either approach is viable.

