# BM25 Retrieval Evaluation

This notebook evaluates BM25 retrieval on:
1. Content field
2. Metadata field

Both fields are evaluated on all 10 legal queries.

In [None]:
import sys
# Add the parent directory to the module search path so the `src.*` imports
# below can resolve. Assumes the kernel's working directory sits one level
# below the project root — TODO confirm.
sys.path.append('..')

# Data handling, plotting, and general-purpose helpers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Project-local modules: data loading, query set, BM25 retrieval, reporting, config
from src.data_loader import load_data, prepare_data, get_documents_by_field
from src.queries import get_all_queries
from src.bm25_retriever import BM25Retriever, evaluate_bm25, save_results
from src.evaluation import print_query_results, create_comparison_table
from src.config import RESULTS_DIR, TOP_K

# Apply seaborn's white-grid style to figures drawn after this point
sns.set_style('whitegrid')
print("✓ Imports successful")

## 1. Load and Prepare Data

In [None]:
# Read the corpus and pass it straight through the preparation step
df = prepare_data(load_data())

print(f"Loaded {len(df)} documents")

# One document list per field that will be searched
documents_content, documents_metadata = (
    get_documents_by_field(df, 'content'),
    get_documents_by_field(df, 'metadata'),
)

# Query set used by every retrieval run below
queries = get_all_queries()
print(f"Loaded {len(queries)} queries")

## 2. BM25 Retrieval on Content

In [None]:
# Run every query against the content-field documents, keeping TOP_K hits per query
results_content = evaluate_bm25(
    documents=documents_content, queries=queries, top_k=TOP_K, field_name="content"
)

In [None]:
# Inspect the content-field hits for the first query in the list
query_idx = 0
query_result = results_content['results'][query_idx]

print_query_results(
    df,
    query_result['query'],
    query_result['retrieved_indices'],
    query_result['scores'],
    top_n=3,
    field='content',
)

## 3. BM25 Retrieval on Metadata

In [None]:
# Run every query against the metadata-field documents, keeping TOP_K hits per query
results_metadata = evaluate_bm25(
    documents=documents_metadata, queries=queries, top_k=TOP_K, field_name="metadata"
)

In [None]:
# Inspect the metadata-field hits for the same query (query_idx is set in the content section)
query_result = results_metadata['results'][query_idx]

print_query_results(
    df,
    query_result['query'],
    query_result['retrieved_indices'],
    query_result['scores'],
    top_n=3,
    field='metadata',
)

## 4. Compare Content vs Metadata Retrieval

In [None]:
# Create comparison dataframe: one row per query with timing and first-score
# columns for the content and metadata runs.
def _top_score(scores):
    """Return the first entry of `scores`, or 0 when there are no scores.

    An explicit None/length check is used instead of truthiness so that the
    helper also accepts a NumPy array; `if array:` raises ValueError when the
    array holds more than one element.
    """
    if scores is None or len(scores) == 0:
        return 0
    return scores[0]


comparison_data = []

for i, query in enumerate(queries):
    # Results are looked up by position: entry i is taken to belong to queries[i].
    content_result = results_content['results'][i]
    metadata_result = results_metadata['results'][i]

    comparison_data.append({
        'Query ID': query['id'],
        'Category': query['category'],
        'Content Time (s)': content_result['retrieval_time'],
        'Metadata Time (s)': metadata_result['retrieval_time'],
        'Content Top Score': _top_score(content_result['scores']),
        'Metadata Top Score': _top_score(metadata_result['scores']),
    })

# Build the frame once from the collected records, then display it
comparison_df = pd.DataFrame(comparison_data)
comparison_df

In [None]:
# Visualize retrieval times (left panel) and top-1 scores (right panel),
# with content and metadata bars side by side for each query.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

bar_width = 0.4
positions = list(range(len(queries)))
query_labels = [q['id'] for q in queries]

# (column-name suffix in comparison_df, y-axis label, panel title)
panels = [
    ('Time (s)', 'Retrieval Time (seconds)', 'BM25 Retrieval Time Comparison'),
    ('Top Score', 'Top-1 BM25 Score', 'BM25 Top-1 Score Comparison'),
]

for ax, (metric, ylabel, title) in zip(axes, panels):
    # Content bars start at x, metadata bars at x + bar_width (left-edge aligned)
    for slot, field in enumerate(('Content', 'Metadata')):
        ax.bar(
            [x + slot * bar_width for x in positions],
            comparison_df[f'{field} {metric}'],
            alpha=0.7, label=field, width=bar_width, align='edge',
        )
    ax.set_xlabel('Query ID')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    # Each pair of bars spans [x, x + 2 * bar_width]; centre the tick on the
    # pair instead of leaving it on the left edge of the content bar.
    ax.set_xticks([x + bar_width for x in positions])
    ax.set_xticklabels(query_labels)

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics: average retrieval time and average top-1 score per field
content_avg_time = results_content['avg_retrieval_time']
metadata_avg_time = results_metadata['avg_retrieval_time']

print("BM25 Retrieval Summary:")
print("="*60)
print("\nContent Field:")
print(f"  Avg retrieval time: {content_avg_time:.4f}s")
print(f"  Avg top-1 score: {comparison_df['Content Top Score'].mean():.4f}")
print("\nMetadata Field:")
print(f"  Avg retrieval time: {metadata_avg_time:.4f}s")
print(f"  Avg top-1 score: {comparison_df['Metadata Top Score'].mean():.4f}")

# Report whichever field was actually faster, and avoid dividing by a zero
# average (possible when timings fall below the timer's resolution).
if content_avg_time > 0 and metadata_avg_time > 0:
    if metadata_avg_time <= content_avg_time:
        print(f"\nSpeed comparison: Metadata is {content_avg_time / metadata_avg_time:.2f}x faster")
    else:
        print(f"\nSpeed comparison: Content is {metadata_avg_time / content_avg_time:.2f}x faster")
else:
    print("\nSpeed comparison: n/a (an average retrieval time of zero was recorded)")

## 5. Analyze Results for Specific Queries

In [None]:
# Interactive query explorer
def explore_query(query_id):
    """Display detailed results for a specific query.

    Prints a header with the query text and category, then calls
    `print_query_results` (with top_n=3) for the content run and again for
    the metadata run.

    Args:
        query_id: 1-based position of the query in `queries`.

    Raises:
        IndexError: if `query_id` is not between 1 and len(queries).
    """
    # Reject 0 and negative ids explicitly: `queries[query_id - 1]` would
    # otherwise wrap around and silently show a query from the end of the
    # list under the wrong heading.
    if not 1 <= query_id <= len(queries):
        raise IndexError(
            f"query_id must be between 1 and {len(queries)}, got {query_id}"
        )

    query_idx = query_id - 1
    query = queries[query_idx]

    print(f"\n{'='*80}")
    print(f"QUERY {query_id}: {query['query']}")
    print(f"Category: {query['category']}")
    print(f"{'='*80}\n")

    # Content results
    print("\n--- CONTENT FIELD RESULTS ---")
    content_result = results_content['results'][query_idx]
    print_query_results(
        df, query['query'],
        content_result['retrieved_indices'],
        content_result['scores'],
        top_n=3, field='content'
    )

    # Metadata results
    print("\n--- METADATA FIELD RESULTS ---")
    metadata_result = results_metadata['results'][query_idx]
    print_query_results(
        df, query['query'],
        metadata_result['retrieved_indices'],
        metadata_result['scores'],
        top_n=3, field='metadata'
    )

# Example: explore query 1 (content results first, then metadata results)
explore_query(1)

In [None]:
# Explore another query of interest; change the id to inspect a different query
explore_query(2)

## 6. Save Results

In [None]:
# Make sure the output directory exists before anything is written to it;
# DataFrame.to_csv does not create missing parent directories.
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

# Save both results
save_results(results_content, RESULTS_DIR / "bm25_content_results.json")
save_results(results_metadata, RESULTS_DIR / "bm25_metadata_results.json")

# Save comparison table
comparison_df.to_csv(RESULTS_DIR / "bm25_comparison.csv", index=False)
print("\nAll results saved successfully!")

## Summary

BM25 evaluation complete!

Key findings:
- BM25 retrieval tested on both content and metadata fields
- Metadata retrieval is expected to be faster because metadata documents are shorter (confirm against the speed comparison printed in Section 4)
- Content retrieval may capture more contextual information
- Results saved for comparison with FAISS and reranking

Next steps:
- Build FAISS indices (notebook 03)
- Compare with dense retrieval
- Apply reranking (notebook 05)