In [None]:
import sys
sys.path.append('..')

import torch
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter, defaultdict

from config import Config
from training.evaluate import BertQAEvaluator
from data.preprocessing import compute_exact_match, compute_f1

# Global plotting defaults shared by every figure in this notebook.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

# Report which device PyTorch can see; this is informational only.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device_name}")

## 1. Load Trained Model

In [None]:
# Load configuration
# Config is instantiated with no overrides; the resulting object is handed to
# BertQAEvaluator in a later cell.
config = Config()

# Path to best model
best_model_path = '../checkpoints/best_model.pt'


def format_eval_loss(value):
    """Return the eval loss formatted to 4 decimals, or 'N/A' when it is missing.

    Formatting the 'N/A' placeholder itself with ``:.4f`` raises ValueError,
    so the missing case has to be handled before applying the float format.
    """
    return 'N/A' if value is None else f"{value:.4f}"


# Check if model exists
if Path(best_model_path).exists():
    print(f"✓ Found model at {best_model_path}")

    # Load checkpoint info onto the CPU so this cell also works without a GPU.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(best_model_path, map_location='cpu')
    print(f"\nModel Info:")
    print(f"  Epoch: {checkpoint.get('epoch', 'N/A')}")
    print(f"  Global Step: {checkpoint.get('global_step', 'N/A')}")
    print(f"  Best Eval Loss: {format_eval_loss(checkpoint.get('best_eval_loss'))}")
else:
    # Keep `checkpoint` defined so later cells that call checkpoint.get(...)
    # fall back to 'N/A' instead of failing with a NameError.
    checkpoint = {}
    print(f"❌ Model not found at {best_model_path}")
    print("Please train the model first using notebooks/04_model_training.ipynb")

In [None]:
# Initialize evaluator
# Built from the checkpoint path and the config object defined in the cells above.
# NOTE(review): presumably this loads the model weights from best_model_path —
# confirm in training/evaluate.py.
evaluator = BertQAEvaluator(best_model_path, config)
print("✓ Evaluator ready")

## 2. Quick Test on Sample Questions

In [None]:
# Test with sample questions
# Hand-written smoke-test data: each entry pairs one context passage with a list
# of questions that are asked against that passage in the loop below.
# The context strings are triple-quoted, so the line breaks and the leading
# indentation of the continuation lines are part of the text.
test_samples = [
    {
        "context": """The Amazon rainforest, also known as Amazonia, is a moist broadleaf tropical 
        rainforest in the Amazon biome that covers most of the Amazon basin of South America. 
        This basin encompasses 7,000,000 km2, of which 5,500,000 km2 are covered by the rainforest. 
        The majority of the forest is contained within Brazil, with 60% of the rainforest.""",
        "questions": [
            "What is another name for the Amazon rainforest?",
            "How much of the Amazon is in Brazil?",
            "What percentage of the rainforest is in Brazil?"
        ]
    },
    {
        "context": """The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. 
        It is named after the engineer Gustave Eiffel, whose company designed and built the tower. 
        Constructed from 1887 to 1889, it was initially criticized but has become a global cultural 
        icon of France and one of the most recognizable structures in the world.""",
        "questions": [
            "Who is the Eiffel Tower named after?",
            "When was the Eiffel Tower constructed?",
            "Where is the Eiffel Tower located?"
        ]
    }
]

print("Testing on Sample Questions:")
print("="*80)

# One banner per sample, then one prediction per question on that sample's context.
banner = '=' * 80
for sample_number, sample in enumerate(test_samples, start=1):
    context = sample['context']

    print(f"\n{banner}")
    print(f"Sample {sample_number}")
    print(f"{banner}")
    # Only the first 100 characters of the context are shown as a preview.
    print(f"Context: {context[:100]}...\n")

    for question_number, question in enumerate(sample['questions'], start=1):
        answer = evaluator.predict(question, context)
        answer_text = answer['text']
        answer_score = answer['score']
        print(f"{question_number}. Q: {question}")
        print(f"   A: {answer_text}")
        print(f"   Confidence: {answer_score:.2f}\n")

## 3. Evaluate on Small Dev Subset (Quick Test)

In [None]:
# Smoke-test the evaluation pipeline on a 100-sample slice of the dev set
# before committing to the full run in the next section.
print("Running quick evaluation on 100 samples...\n")

quick_eval_options = {'max_samples': 100}
quick_results = evaluator.evaluate_dataset('../archive/dev-v1.1.json', **quick_eval_options)

quick_em = quick_results['exact_match']
quick_f1 = quick_results['f1']

print(f"\nQuick Test Results (100 samples):")
print(f"  Exact Match: {quick_em:.2f}%")
print(f"  F1 Score:    {quick_f1:.2f}%")

## 4. Full Evaluation on Dev Set

**Note:** This will take 15-30 minutes depending on your hardware.

In [None]:
# Full evaluation on entire dev set
print("Running full evaluation on dev set...")
print("This may take 15-30 minutes...\n")

full_results = evaluator.evaluate_dataset('../archive/dev-v1.1.json')

# Make sure the output directory exists before anything is written to it.
# Later cells also save figures and reports under ../outputs, and neither
# plt.savefig nor open(..., 'w') creates missing parent directories.
Path('../outputs').mkdir(parents=True, exist_ok=True)

# Save predictions
evaluator.save_predictions(
    full_results['predictions'],
    '../outputs/dev_predictions.json'
)

## 5. Analyze Results by Question Type

In [None]:
# Load dev dataset to analyze questions.
# The encoding is pinned because the dataset contains non-ASCII text and the
# platform default encoding is not UTF-8 everywhere.
with open('../archive/dev-v1.1.json', 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

# Per question type: the list of EM scores, the list of F1 scores, and a counter.
question_type_metrics = defaultdict(lambda: {'em': [], 'f1': [], 'count': 0})

for article in dev_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            question_id = qa['id']
            question = qa['question']

            # Question type is the lower-cased first word. Splitting first also
            # covers whitespace-only questions, which would otherwise raise
            # IndexError on `split()[0]`.
            question_words = question.split()
            question_type = question_words[0].lower() if question_words else 'unknown'

            # Only questions that actually received a prediction are scored.
            if question_id in full_results['predictions']:
                prediction = full_results['predictions'][question_id]

                # Score against every gold answer and keep the best match, as the
                # official SQuAD v1.1 metric does. Using only the first answer
                # under-reports EM/F1 for questions with several valid answers.
                references = [answer['text'] for answer in qa['answers']] or ['']
                em = max(compute_exact_match(prediction, ref) for ref in references)
                f1 = max(compute_f1(prediction, ref) for ref in references)

                question_type_metrics[question_type]['em'].append(em)
                question_type_metrics[question_type]['f1'].append(f1)
                question_type_metrics[question_type]['count'] += 1

# Calculate averages
type_summary = []
for qtype, metrics in question_type_metrics.items():
    if metrics['count'] > 10:  # Only include types with >10 examples
        type_summary.append({
            'Question Type': qtype.capitalize(),
            'Count': metrics['count'],
            'EM (%)': np.mean(metrics['em']) * 100,
            'F1 (%)': np.mean(metrics['f1']) * 100
        })

# Create DataFrame and sort by count. The columns are given explicitly so that an
# empty summary still yields a frame with a 'Count' column instead of a KeyError.
type_df = pd.DataFrame(
    type_summary,
    columns=['Question Type', 'Count', 'EM (%)', 'F1 (%)']
).sort_values('Count', ascending=False)

print("\n" + "="*70)
print("Performance by Question Type")
print("="*70)
print(type_df.to_string(index=False))
print("="*70)

In [None]:
# Side-by-side horizontal bar charts (EM on the left, F1 on the right) for the
# ten most frequent question types.
top_types = type_df.head(10)

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (axis, DataFrame column, x-axis label, panel title, bar colour)
panel_specs = [
    (axes[0], 'EM (%)', 'Exact Match (%)', 'EM Score by Question Type (Top 10)', 'skyblue'),
    (axes[1], 'F1 (%)', 'F1 Score (%)', 'F1 Score by Question Type (Top 10)', 'lightcoral'),
]

for panel_ax, metric_column, x_label, panel_title, bar_color in panel_specs:
    panel_ax.barh(top_types['Question Type'], top_types[metric_column],
                  color=bar_color, edgecolor='black')
    panel_ax.set_xlabel(x_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    # Flip the y-axis so the most frequent question type is drawn at the top.
    panel_ax.invert_yaxis()
    panel_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/question_type_performance.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Error Analysis

In [None]:
# Analyze errors
# NOTE(review): if analyze_errors returns only the `top_n` worst examples rather
# than every EM=0 prediction, then len(errors) — and the error rate derived from
# it — is capped at 15. Confirm the return contract in training/evaluate.py.
errors = evaluator.analyze_errors(
    full_results['predictions'],
    full_results['references'],
    top_n=15
)

total_predictions = len(full_results['predictions'])
total_errors = len(errors)
# Guard against an empty prediction set so the summary prints 0% instead of
# raising ZeroDivisionError.
error_rate = (total_errors / total_predictions) * 100 if total_predictions else 0.0

print(f"\n{'='*70}")
print("Error Summary")
print(f"{'='*70}")
print(f"Total Predictions: {total_predictions}")
print(f"Total Errors (EM=0): {total_errors}")
print(f"Error Rate: {error_rate:.2f}%")
print(f"{'='*70}")

In [None]:
# Categorize errors by F1 score. The three buckets are disjoint and together
# cover every F1 value from 0 upwards.
error_categories = {
    'Complete Miss (F1=0)': [e for e in errors if e['f1'] == 0],
    'Partial Match (0<F1<0.5)': [e for e in errors if 0 < e['f1'] < 0.5],
    'Close Match (F1>=0.5)': [e for e in errors if e['f1'] >= 0.5]
}

print("\nError Categories:")
for category, error_list in error_categories.items():
    # With no errors at all, every share is reported as 0% rather than dividing by zero.
    share = len(error_list) / total_errors * 100 if total_errors else 0.0
    print(f"  {category}: {len(error_list)} ({share:.1f}%)")

In [None]:
# Visualize error distribution: histogram of the F1 score of each collected error.
f1_scores = [e['f1'] for e in errors]

plt.figure(figsize=(12, 6))
plt.hist(f1_scores, bins=50, edgecolor='black', alpha=0.7, color='orange')
plt.xlabel('F1 Score', fontsize=12)
plt.ylabel('Number of Errors', fontsize=12)
plt.title('Distribution of F1 Scores for Incorrect Predictions (EM=0)', fontsize=14, fontweight='bold')

# The mean marker is only drawn when there is something to average: np.mean([])
# is NaN (with a RuntimeWarning), which would give a meaningless line and label.
if f1_scores:
    mean_f1 = np.mean(f1_scores)
    plt.axvline(mean_f1, color='red', linestyle='--', linewidth=2,
                label=f'Mean F1: {mean_f1:.3f}')
    plt.legend(fontsize=11)

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('../outputs/error_f1_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Performance Comparison with Baseline

In [None]:
# Put the model's dev-set scores next to the reference BERT-base figures.
# NOTE(review): the source of the 82.0 / 90.0 reference values is not shown in
# this notebook — confirm them against the figures you intend to cite.
expected_em, expected_f1 = 82.0, 90.0
model_em = full_results['exact_match']
model_f1 = full_results['f1']

baseline_scores = {
    'Model': ['BERT-base (Expected)', 'Our Model'],
    'Exact Match': [expected_em, model_em],
    'F1 Score': [expected_f1, model_f1]
}
comparison_df = pd.DataFrame(baseline_scores)

table_rule = "=" * 60
print("\n" + table_rule)
print("Performance Comparison")
print(table_rule)
print(comparison_df.to_string(index=False))
print(table_rule)

# Signed gap to the reference: positive means the model beats the expectation.
em_diff = model_em - expected_em
f1_diff = model_f1 - expected_f1

print(f"\nDifference from Expected BERT-base:")
print(f"  EM: {em_diff:+.2f}%")
print(f"  F1: {f1_diff:+.2f}%")

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
x = np.arange(len(comparison_df['Model']))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# The two metric bars sit half a bar-width to either side of each group position.
rects1 = ax.bar(x - width/2, comparison_df['Exact Match'], width,
                label='Exact Match', color='skyblue', edgecolor='black')
rects2 = ax.bar(x + width/2, comparison_df['F1 Score'], width,
                label='F1 Score', color='lightcoral', edgecolor='black')

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Score (%)', fontsize=12)
ax.set_ylim([0, 100])
ax.set_xticks(x)
ax.set_xticklabels(comparison_df['Model'], fontsize=11)
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11)


def autolabel(rects):
    """Annotate each bar in `rects` with its height, formatted to two decimals.

    The label is centred horizontally on the bar and placed 3 points above its
    top edge. Labels are drawn on the `ax` created earlier in this cell.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{bar_height:.2f}',
                    xy=(bar_center, bar_height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=10)


for bar_group in (rects1, rects2):
    autolabel(bar_group)

plt.tight_layout()
plt.savefig('../outputs/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Save Evaluation Report

In [None]:
# Create comprehensive evaluation report: model provenance, headline metrics,
# error statistics and the per-question-type table, all in one JSON document.
report = {
    'model_info': {
        'model_name': 'BERT-base-uncased',
        'checkpoint': best_model_path,
        'epoch': checkpoint.get('epoch', 'N/A'),
        'global_step': checkpoint.get('global_step', 'N/A')
    },
    'overall_metrics': {
        'exact_match': full_results['exact_match'],
        'f1_score': full_results['f1'],
        'total_examples': len(full_results['predictions'])
    },
    'error_analysis': {
        'total_errors': total_errors,
        'error_rate': error_rate,
        'avg_f1_on_errors': np.mean(f1_scores) if f1_scores else 0
    },
    'question_type_performance': type_df.to_dict('records')
}


def _to_json_builtin(value):
    """Convert NumPy scalars/arrays to plain Python values for json.dump.

    The json module only serializes built-in types, so values such as
    np.float32 or np.int64 would otherwise abort the dump with a TypeError.
    Anything else is rejected with the same TypeError json would raise.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Save report
report_path = '../outputs/evaluation_report.json'
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2, default=_to_json_builtin)

print(f"\n✓ Evaluation report saved to {report_path}")

In [None]:
# Create markdown report.
# DataFrame.to_markdown depends on the optional `tabulate` package and raises
# ImportError without it; in that case the table is embedded as a plain-text
# code block so the report is still written.
try:
    type_table_md = type_df.head(10).to_markdown(index=False)
except ImportError:
    type_table_md = "```text\n" + type_df.head(10).to_string(index=False) + "\n```"

markdown_report = f"""# BERT-QA Evaluation Report

## Model Information
- **Model**: BERT-base-uncased
- **Checkpoint**: {best_model_path}
- **Epoch**: {checkpoint.get('epoch', 'N/A')}
- **Training Steps**: {checkpoint.get('global_step', 'N/A')}

## Overall Performance
- **Exact Match (EM)**: {full_results['exact_match']:.2f}%
- **F1 Score**: {full_results['f1']:.2f}%
- **Total Examples**: {len(full_results['predictions']):,}

## Comparison with BERT-base Baseline
| Metric | Expected BERT-base | Our Model | Difference |
|--------|-------------------|-----------|------------|
| EM     | 82.0%            | {full_results['exact_match']:.2f}% | {full_results['exact_match']-82.0:+.2f}% |
| F1     | 90.0%            | {full_results['f1']:.2f}% | {full_results['f1']-90.0:+.2f}% |

## Error Analysis
- **Total Errors (EM=0)**: {total_errors:,} ({error_rate:.2f}%)
- **Average F1 on Errors**: {np.mean(f1_scores) if f1_scores else 0:.3f}

## Top Question Types Performance
{type_table_md}

## Conclusion
The model achieves competitive performance on the SQuAD v1.1 dataset, demonstrating strong 
question-answering capabilities across various question types.

---
*Report generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

# Save markdown report. UTF-8 is pinned because question-type labels come from
# the dataset text and the platform default encoding may not cover them.
md_report_path = '../outputs/evaluation_report.md'
with open(md_report_path, 'w', encoding='utf-8') as f:
    f.write(markdown_report)

print(f"✓ Markdown report saved to {md_report_path}")

## Summary

### Evaluation Complete! ✓

**Key Metrics:**
- Exact Match and F1 scores computed on the full dev set
- Performance analyzed by question type
- Error patterns identified and categorized

**Generated Files:**
- `outputs/dev_predictions.json` - All predictions
- `outputs/evaluation_report.json` - Detailed metrics
- `outputs/evaluation_report.md` - Human-readable report
- `outputs/*.png` - Visualization charts

**Next Step:** Create a deployment interface for interactive question answering!