## 1. Setup & Dependencies

### 1.1. Install Required Packages

In [3]:
# !pip install pyserini
# !pip install faiss-cpu
# !pip install torch
# !pip install transformers
# !pip install sentence-transformers

In [4]:
import os
import re
import warnings
from typing import Dict, List, Tuple, Optional
from collections import defaultdict
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Silence every warning category so the long tuning loops stay readable.
# NOTE: this is global and also hides deprecation warnings from libraries.
warnings.filterwarnings("ignore")

# Use the GPU for the cross-encoder whenever torch can see one.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Seed both numeric libraries the notebook relies on, not only NumPy, so any
# stochastic step is repeatable after Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("✓ Dependencies imported")

  from .autonotebook import tqdm as notebook_tqdm
[0;93m2026-01-09 15:18:52.217615681 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


Using device: cuda
✓ Dependencies imported


### 1.2. Load Pyserini Index

In [6]:
# Name of the prebuilt Pyserini index used by every searcher in this notebook.
INDEX_NAME = "robust04"

# `searcher` is the default-configured searcher (also used to fetch document
# text for re-ranking); `index_reader` is only used here for collection stats.
searcher = LuceneSearcher.from_prebuilt_index(INDEX_NAME)
index_reader = IndexReader.from_prebuilt_index(INDEX_NAME)

# Fetch the statistics once instead of once per printed line.
index_stats = index_reader.stats()

print(f"Index: {INDEX_NAME}")
print(f"Total documents: {index_stats['documents']:,}")
print(f"Total terms: {index_stats['total_terms']:,}")
print("✓ Pyserini index loaded")

Index: robust04
Total documents: 528,030
Total terms: 174,540,872
✓ Pyserini index loaded


## 2. Data Loading

### 2.1. Load Queries

In [7]:
DATA_DIR = "./data/"

def load_queries(filepath: str) -> Dict[str, str]:
    """Load queries from a tab-separated file. Format: qid<tab>query_text

    Args:
        filepath: Path to a UTF-8 text file with one query per line.

    Returns:
        Mapping from query id to query text, in file order. If a query id
        occurs more than once, the last occurrence wins.

    Lines without a tab separator (including blank lines) are skipped.
    """
    queries: Dict[str, str] = {}
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Split on the FIRST tab only, so a query whose text itself
            # contains a tab is kept instead of being silently dropped.
            qid, sep, text = line.strip().partition("\t")
            if sep:
                queries[qid] = text
    return queries

# Load the full topic set (qid -> query text) from DATA_DIR.
all_queries = load_queries(os.path.join(DATA_DIR, "queriesROBUST.txt"))

print(f"Total queries loaded: {len(all_queries)}")
print(f"\nSample queries:")
# Preview the first five entries (dict order = file order) as a sanity check.
for qid, text in list(all_queries.items())[:5]:
    print(f"  {qid}: {text}")

Total queries loaded: 249

Sample queries:
  301: international organized crime
  302: poliomyelitis post polio
  303: hubble telescope achievements
  304: endangered species mammals
  305: dangerous vehicles


### 2.2. Load Relevance Judgments

In [8]:
def load_qrels(filepath: str) -> Dict[str, Dict[str, int]]:
    """Read relevance judgments from a whitespace-separated qrels file.

    Each usable line carries at least four fields: ``qid 0 docid relevance``.
    Lines with fewer than four fields are ignored; fields beyond the fourth
    are not read.

    Returns:
        Plain nested dict ``{qid: {docid: relevance}}`` with integer grades.
    """
    judgments = {}
    with open(filepath, "r") as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if len(fields) < 4:
                continue
            topic, docid, grade = fields[0], fields[2], fields[3]
            judgments.setdefault(topic, {})[docid] = int(grade)
    return judgments

# Judgments exist only for the topics listed in this file; those topics are
# used further down as the tuning ("training") set.
qrels = load_qrels(os.path.join(DATA_DIR, "qrels_50_Queries"))

print(f"Queries with relevance judgments: {len(qrels)}")
print(f"Total judgments: {sum(len(v) for v in qrels.values()):,}")
print(f"\nSample qrels for query 301:")
# .get() keeps this preview from raising if topic 301 is absent from the file.
sample_rels = list(qrels.get("301", {}).items())[:5]
for docid, rel in sample_rels:
    print(f"  {docid}: {rel}")

Queries with relevance judgments: 50
Total judgments: 61,511

Sample qrels for query 301:
  FBIS3-10082: 1
  FBIS3-10169: 0
  FBIS3-10243: 1
  FBIS3-10319: 0
  FBIS3-10397: 1


### 2.3. Train/Test Split

In [9]:
# Topics with relevance judgments form the tuning ("training") set;
# sorted() orders the qid strings lexicographically.
train_qids = sorted(qrels.keys())
# Every remaining topic, kept in all_queries order, is a test topic.
# NOTE(review): membership is tested against a list (linear scan per qid);
# harmless at this size.
test_qids = [qid for qid in all_queries.keys() if qid not in train_qids]

# Raises KeyError if a judged topic has no entry in all_queries.
train_queries = {qid: all_queries[qid] for qid in train_qids}
test_queries = {qid: all_queries[qid] for qid in test_qids}

print(f"Training queries: {len(train_queries)} (with qrels)")
print(f"Test queries: {len(test_queries)} (no qrels)")
print(f"\nTrain QIDs: {train_qids[:10]}...")
print(f"Test QIDs: {test_qids[:10]}...")

Training queries: 50 (with qrels)
Test queries: 199 (no qrels)

Train QIDs: ['301', '302', '303', '304', '305', '306', '307', '308', '309', '310']...
Test QIDs: ['351', '352', '353', '354', '355', '356', '357', '358', '359', '360']...


## 3. Evaluation Framework

### 3.1. MAP Computation

In [10]:
def compute_ap(ranked_docs: List[str], relevance: Dict[str, int]) -> float:
    """Average Precision of one ranked list against graded judgments.

    A document counts as relevant when its grade is strictly positive.
    Precision is accumulated at every rank that holds a relevant document and
    the sum is divided by the total number of relevant documents, so relevant
    documents missing from the ranking lower the score.

    Returns 0.0 when the judgments contain no relevant document.
    """
    relevant_ids = {docid for docid, grade in relevance.items() if grade > 0}
    if len(relevant_ids) == 0:
        return 0.0

    found = 0
    running_precision = 0.0
    for position, docid in enumerate(ranked_docs, start=1):
        if docid not in relevant_ids:
            continue
        found += 1
        running_precision += found / position

    return running_precision / len(relevant_ids)


def compute_map(
    run: Dict[str, List[Tuple[str, float]]],
    qrels: Dict[str, Dict[str, int]]
) -> float:
    """Mean Average Precision over the queries of *run* that have judgments.

    Queries present in *run* but absent from *qrels* are ignored. Scores in
    the run are not used; only the order of each result list matters.

    Returns 0.0 when no query of the run is judged.
    """
    per_query_ap = [
        compute_ap([docid for docid, _score in hits], qrels[qid])
        for qid, hits in run.items()
        if qid in qrels
    ]
    if not per_query_ap:
        return 0.0
    return np.mean(per_query_ap)


print("✓ MAP evaluation functions defined")

✓ MAP evaluation functions defined


### 3.2. Evaluation Utilities

In [11]:
def search_batch(
    searcher: LuceneSearcher,
    queries: Dict[str, str],
    k: int = 1000
) -> Dict[str, List[Tuple[str, float]]]:
    """Search every query with *searcher* and collect the hits per query id.

    Args:
        searcher: Configured searcher whose ``search(text, k=...)`` is called.
        queries: Mapping from query id to query text.
        k: Number of hits requested per query.

    Returns:
        ``{qid: [(docid, score), ...]}`` in the order the searcher returned.
    """
    return {
        qid: [(hit.docid, hit.score) for hit in searcher.search(query_text, k=k)]
        for qid, query_text in tqdm(queries.items(), desc="Searching")
    }


def evaluate_run(
    run: Dict[str, List[Tuple[str, float]]],
    qrels: Dict[str, Dict[str, int]],
    run_name: str = "run"
) -> Dict[str, float]:
    """Evaluate a run and return its MAP together with per-query AP.

    Args:
        run: ``{qid: [(docid, score), ...]}``; only the list order is used.
        qrels: ``{qid: {docid: relevance}}``.
        run_name: Label copied into the result for bookkeeping.

    Returns:
        Dict with keys ``run_name`` (str), ``map`` (mean of the per-query AP
        values, 0.0 if no query is judged), ``num_queries`` (number of judged
        queries in the run) and ``per_query_ap`` (``{qid: AP}``).
    """
    # Compute AP exactly once per judged query; MAP is derived from these
    # values instead of scoring every query a second time.
    per_query = {
        qid: compute_ap([docid for docid, _ in results], qrels[qid])
        for qid, results in run.items()
        if qid in qrels
    }
    map_score = np.mean(list(per_query.values())) if per_query else 0.0

    return {
        "run_name": run_name,
        "map": map_score,
        "num_queries": len(per_query),
        "per_query_ap": per_query
    }


print("✓ Evaluation utilities defined")

✓ Evaluation utilities defined


## 4. Method 1: BM25 Baseline

### 4.1. Implementation

In [None]:
def create_bm25_searcher(k1: float = 0.9, b: float = 0.4) -> LuceneSearcher:
    """Open a new searcher on INDEX_NAME and configure it for BM25.

    Args:
        k1: BM25 ``k1`` value forwarded to ``set_bm25``.
        b: BM25 ``b`` value forwarded to ``set_bm25``.

    Returns:
        A new LuceneSearcher; each call opens its own instance.
    """
    bm25 = LuceneSearcher.from_prebuilt_index(INDEX_NAME)
    bm25.set_bm25(k1=k1, b=b)
    return bm25


print("✓ BM25 searcher factory defined")

### 4.2. Parameter Tuning

In [None]:
# Exhaustive grid over k1 x b, scored by MAP on the judged (training) topics.
bm25_param_grid = {
    "k1": [0.6, 0.9, 1.2, 1.5, 2.0],
    "b": [0.3, 0.4, 0.5, 0.6, 0.75]
}

# One record per grid point; turned into a DataFrame after the loop.
bm25_results = []

print("Tuning BM25 parameters...\n")

for k1, b in product(bm25_param_grid["k1"], bm25_param_grid["b"]):
    # A new searcher is opened for every grid point.
    s = create_bm25_searcher(k1=k1, b=b)
    run = search_batch(s, train_queries, k=1000)
    metrics = evaluate_run(run, qrels, f"BM25_k1={k1}_b={b}")
    
    bm25_results.append({
        "k1": k1,
        "b": b,
        "map": metrics["map"]
    })
    print(f"k1={k1}, b={b} -> MAP={metrics['map']:.4f}")

# Sorted by MAP descending, so row 0 is the best configuration.
bm25_df = pd.DataFrame(bm25_results).sort_values("map", ascending=False)
print("\n" + "="*50)
print("BM25 Tuning Results (Top 5):")
print("="*50)
print(bm25_df.head())

### 4.3. Validation Results

In [None]:
# bm25_df is sorted by MAP descending, so the first row is the best grid point.
best_bm25 = bm25_df.iloc[0]
best_k1, best_b = best_bm25["k1"], best_bm25["b"]

print(f"Best BM25 Parameters:")
print(f"  k1 = {best_k1}")
print(f"  b = {best_b}")
print(f"  MAP = {best_bm25['map']:.4f}")

# Searcher with the winning parameters; reused later as the first stage of the
# hybrid pipeline and for test-time inference.
bm25_searcher = create_bm25_searcher(k1=best_k1, b=best_b)
# NOTE(review): bm25_run_train is not referenced again in the visible cells.
bm25_run_train = search_batch(bm25_searcher, train_queries, k=1000)

print("\n✓ BM25 baseline configured")

## 5. Method 2: BM25 + RM3 (Query Expansion)

### 5.1. Implementation

In [None]:
def create_rm3_searcher(
    k1: float = 0.9,
    b: float = 0.4,
    fb_terms: int = 10,
    fb_docs: int = 10,
    original_weight: float = 0.5
) -> LuceneSearcher:
    """Open a new searcher on INDEX_NAME configured for BM25 with RM3.

    Args:
        k1: BM25 ``k1`` value forwarded to ``set_bm25``.
        b: BM25 ``b`` value forwarded to ``set_bm25``.
        fb_terms: Forwarded to ``set_rm3`` as ``fb_terms``.
        fb_docs: Forwarded to ``set_rm3`` as ``fb_docs``.
        original_weight: Forwarded to ``set_rm3`` as ``original_query_weight``.

    Returns:
        A new LuceneSearcher with BM25 set first and RM3 enabled afterwards.
    """
    expanded = LuceneSearcher.from_prebuilt_index(INDEX_NAME)
    expanded.set_bm25(k1=k1, b=b)
    expanded.set_rm3(
        fb_terms=fb_terms,
        fb_docs=fb_docs,
        original_query_weight=original_weight,
    )
    return expanded


print("✓ RM3 searcher factory defined")

### 5.2. Parameter Tuning

In [None]:
# Grid over the three RM3 settings; the BM25 parameters stay fixed at the best
# values found in the BM25 tuning step (best_k1, best_b).
rm3_param_grid = {
    "fb_terms": [10, 20, 30],
    "fb_docs": [5, 10, 15],
    "original_weight": [0.3, 0.5, 0.7]
}

# One record per grid point; turned into a DataFrame after the loop.
rm3_results = []

print("Tuning RM3 parameters (using best BM25 params)...\n")

for fb_terms, fb_docs, orig_w in product(
    rm3_param_grid["fb_terms"],
    rm3_param_grid["fb_docs"],
    rm3_param_grid["original_weight"]
):
    # A new searcher is opened for every grid point.
    s = create_rm3_searcher(
        k1=best_k1, b=best_b,
        fb_terms=fb_terms, fb_docs=fb_docs, original_weight=orig_w
    )
    run = search_batch(s, train_queries, k=1000)
    # No run name is passed here, so evaluate_run labels the result "run".
    metrics = evaluate_run(run, qrels)
    
    rm3_results.append({
        "fb_terms": fb_terms,
        "fb_docs": fb_docs,
        "original_weight": orig_w,
        "map": metrics["map"]
    })
    print(f"fb_terms={fb_terms}, fb_docs={fb_docs}, orig_w={orig_w} -> MAP={metrics['map']:.4f}")

# Sorted by MAP descending, so row 0 is the best configuration.
rm3_df = pd.DataFrame(rm3_results).sort_values("map", ascending=False)
print("\n" + "="*50)
print("RM3 Tuning Results (Top 5):")
print("="*50)
print(rm3_df.head())

### 5.3. Validation Results

In [None]:
# rm3_df is sorted by MAP descending, so the first row is the best grid point.
best_rm3 = rm3_df.iloc[0]
# The two feedback counts are cast back to int before being passed on.
best_fb_terms = int(best_rm3["fb_terms"])
best_fb_docs = int(best_rm3["fb_docs"])
best_orig_w = best_rm3["original_weight"]

print(f"Best RM3 Parameters:")
print(f"  fb_terms = {best_fb_terms}")
print(f"  fb_docs = {best_fb_docs}")
print(f"  original_weight = {best_orig_w}")
print(f"  MAP = {best_rm3['map']:.4f}")

# Searcher with the winning RM3 settings; reused for test-time inference.
rm3_searcher = create_rm3_searcher(
    k1=best_k1, b=best_b,
    fb_terms=best_fb_terms, fb_docs=best_fb_docs, original_weight=best_orig_w
)
# NOTE(review): rm3_run_train is not referenced again in the visible cells.
rm3_run_train = search_batch(rm3_searcher, train_queries, k=1000)

# Relative MAP change versus the best BM25 configuration, in percent.
# Both MAP values were measured on the same topics that were used for tuning.
improvement = (best_rm3['map'] - best_bm25['map']) / best_bm25['map'] * 100
print(f"\nImprovement over BM25: {improvement:+.2f}%")
print("\n✓ RM3 configured")

## 6. Method 3: Hybrid Neural Re-ranking (Advanced)

### 6.1. First-Stage Retrieval

In [None]:
RERANK_DEPTH = 100  # Number of candidates to re-rank

def get_candidates(
    searcher: LuceneSearcher,
    queries: Dict[str, str],
    k: int = RERANK_DEPTH
) -> Dict[str, List[Tuple[str, float]]]:
    """Fetch the first-stage candidate lists that will be re-ranked.

    Thin wrapper around ``search_batch`` whose default depth is RERANK_DEPTH.

    Returns:
        ``{qid: [(docid, score), ...]}`` with up to *k* hits per query.
    """
    candidates = search_batch(searcher, queries, k=k)
    return candidates


print(f"✓ First-stage retrieval configured (depth={RERANK_DEPTH})")

### 6.2. Cross-Encoder Re-ranking

In [None]:
# Hugging Face model id of the cross-encoder used for neural re-ranking.
MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

print(f"Loading cross-encoder: {MODEL_NAME}")
# Tokenizer and model are kernel-global; the re-ranking function reads them
# directly instead of receiving them as arguments.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
cross_encoder = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
cross_encoder = cross_encoder.to(DEVICE)
# Put the module in evaluation mode for inference.
cross_encoder.eval()

print(f"✓ Cross-encoder loaded on {DEVICE}")

In [None]:
# Matches one SGML/XML-style tag such as <TEXT> or </DOCNO>.
_MARKUP_TAG = re.compile(r"<[^>]+>")


def _strip_markup(raw: str) -> str:
    """Remove markup tags from *raw* and collapse whitespace runs to one space."""
    return " ".join(_MARKUP_TAG.sub(" ", raw).split())


def get_doc_text(docid: str, max_chars: int = 2000) -> str:
    """Return cleaned, truncated document text for the cross-encoder.

    The stored raw record is fetched through the notebook-global ``searcher``.
    Markup tags are removed and whitespace is collapsed BEFORE truncating, so
    the character budget is spent on content rather than on tags and padding.
    NOTE(review): assumes ``raw()`` returns the stored SGML-style record for
    this index — confirm; for untagged text the cleaning only normalises
    whitespace.

    Args:
        docid: Collection document id.
        max_chars: Maximum number of characters returned (default 2000).

    Returns:
        The cleaned text, or "" if the document is unknown or has no raw
        content.
    """
    doc = searcher.doc(docid)
    if doc is None:
        return ""
    raw = doc.raw()
    if not raw:
        return ""
    return _strip_markup(raw)[:max_chars]


@torch.no_grad()
def rerank_with_cross_encoder(
    query: str,
    candidates: List[Tuple[str, float]],
    batch_size: int = 32
) -> List[Tuple[str, float]]:
    """Score every candidate against *query* and sort by that score.

    Uses the notebook-global ``tokenizer``, ``cross_encoder`` and ``DEVICE``;
    document text comes from ``get_doc_text``. The first-stage scores in
    *candidates* are ignored.

    Args:
        query: Query text.
        candidates: ``[(docid, first_stage_score), ...]``.
        batch_size: Number of (query, document) pairs per forward pass.

    Returns:
        ``[(docid, neural_score), ...]`` sorted by neural score, highest
        first; an empty list when there are no candidates.
    """
    if not candidates:
        return []

    docids = [docid for docid, _first_stage in candidates]
    texts = [get_doc_text(docid) for docid in docids]

    neural_scores = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]

        encoded = tokenizer(
            [[query, text] for text in chunk],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(DEVICE)

        logits = cross_encoder(**encoded).logits
        flat = logits.squeeze(-1).cpu().numpy()
        # A 0-d result has no list form; take its single value instead.
        if flat.ndim == 0:
            neural_scores.append(flat.item())
        else:
            neural_scores.extend(flat.tolist())

    scored = list(zip(docids, neural_scores))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


print("✓ Cross-encoder re-ranking functions defined")

### 6.3. Reciprocal Rank Fusion

In [None]:
def reciprocal_rank_fusion(
    runs: List[List[Tuple[str, float]]],
    k: int = 60
) -> List[Tuple[str, float]]:
    """
    Fuse several ranked lists with Reciprocal Rank Fusion.

    Every document receives 1 / (k + rank) from each list it appears in, with
    rank counted from 1, and the contributions are summed. The scores stored
    in the input lists are ignored; only positions matter.

    Returns ``[(docid, rrf_score), ...]`` sorted by fused score, highest
    first. Documents with equal fused scores keep first-seen order.
    """
    fused_scores = {}

    for ranked_list in runs:
        for index, (docid, _score) in enumerate(ranked_list):
            contribution = 1.0 / (k + index + 1)
            fused_scores[docid] = fused_scores.get(docid, 0.0) + contribution

    ordered = list(fused_scores.items())
    ordered.sort(key=lambda item: item[1], reverse=True)
    return ordered


print("✓ RRF fusion function defined")

In [None]:
def hybrid_search(
    queries: Dict[str, str],
    bm25_searcher: LuceneSearcher,
    rerank_depth: int = 100,
    rrf_k: int = 60,
    final_k: int = 1000
) -> Dict[str, List[Tuple[str, float]]]:
    """Full hybrid pipeline: BM25 -> Neural Re-rank -> RRF Fusion.

    For every query the top ``rerank_depth`` BM25 hits are re-scored by the
    cross-encoder, and the BM25 and neural rankings are combined with
    Reciprocal Rank Fusion. Hits below ``rerank_depth`` are kept in BM25 order
    so that up to ``final_k`` documents are returned.

    Args:
        queries: Mapping from query id to query text.
        bm25_searcher: First-stage searcher.
        rerank_depth: Number of top BM25 hits given to the cross-encoder.
        rrf_k: RRF smoothing constant.
        final_k: Maximum number of results returned per query.

    Returns:
        ``{qid: [(docid, fused_score), ...]}``, best first.
    """
    results = {}
    # One retrieval call per query serves both the re-ranking candidates and
    # the tail of the final list.
    retrieval_depth = max(rerank_depth, final_k)

    for qid, query_text in tqdm(queries.items(), desc="Hybrid search"):
        # First-stage: BM25
        hits = bm25_searcher.search(query_text, k=retrieval_depth)
        bm25_ranked = [(hit.docid, hit.score) for hit in hits]
        bm25_candidates = bm25_ranked[:rerank_depth]

        # Second-stage: Neural re-ranking of the head of the BM25 list
        neural_reranked = rerank_with_cross_encoder(query_text, bm25_candidates)

        # Fusion: RRF over the FULL BM25 list and the neural list.
        # Re-ranked documents appear in both lists, so each scores at least
        # 2 / (rrf_k + n) for n candidates. A tail document at BM25 rank r > n
        # scores only 1 / (rrf_k + r), which is lower and strictly decreasing
        # in r. The tail therefore stays below the head in BM25 order, with
        # distinct scores instead of a block of ties at 0.0 that a
        # score-sorting evaluator would reorder.
        fused = reciprocal_rank_fusion([bm25_ranked, neural_reranked], k=rrf_k)

        results[qid] = fused[:final_k]

    return results


print("✓ Hybrid search pipeline defined")

### 6.4. Parameter Tuning

In [None]:
# Grid over the candidate depth and the RRF constant, scored by MAP on the
# judged (training) topics with the tuned BM25 searcher as first stage.
hybrid_param_grid = {
    "rerank_depth": [50, 100],
    "rrf_k": [30, 60, 90]
}

# One record per grid point; turned into a DataFrame after the loop.
hybrid_results = []

print("Tuning hybrid parameters...\n")

# NOTE(review): hybrid_search re-runs the cross-encoder for every grid point,
# although the neural scores depend only on the query and rerank_depth, not on
# rrf_k. Caching the re-ranked lists per rerank_depth would cut the neural
# work to one pass per depth.
for rerank_depth, rrf_k in product(
    hybrid_param_grid["rerank_depth"],
    hybrid_param_grid["rrf_k"]
):
    run = hybrid_search(
        train_queries,
        bm25_searcher,
        rerank_depth=rerank_depth,
        rrf_k=rrf_k
    )
    metrics = evaluate_run(run, qrels)
    
    hybrid_results.append({
        "rerank_depth": rerank_depth,
        "rrf_k": rrf_k,
        "map": metrics["map"]
    })
    print(f"rerank_depth={rerank_depth}, rrf_k={rrf_k} -> MAP={metrics['map']:.4f}")

# Sorted by MAP descending, so row 0 is the best configuration.
hybrid_df = pd.DataFrame(hybrid_results).sort_values("map", ascending=False)
print("\n" + "="*50)
print("Hybrid Tuning Results:")
print("="*50)
print(hybrid_df)

### 6.5. Validation Results

In [None]:
# hybrid_df is sorted by MAP descending, so the first row is the best setting.
best_hybrid = hybrid_df.iloc[0]
# Both settings are cast back to int before being passed on.
best_rerank_depth = int(best_hybrid["rerank_depth"])
best_rrf_k = int(best_hybrid["rrf_k"])

print(f"Best Hybrid Parameters:")
print(f"  rerank_depth = {best_rerank_depth}")
print(f"  rrf_k = {best_rrf_k}")
print(f"  MAP = {best_hybrid['map']:.4f}")

# Re-runs the full pipeline (including the cross-encoder) with the best
# setting. NOTE(review): hybrid_run_train is not referenced again in the
# visible cells.
hybrid_run_train = hybrid_search(
    train_queries,
    bm25_searcher,
    rerank_depth=best_rerank_depth,
    rrf_k=best_rrf_k
)

# Relative MAP change versus the best BM25 configuration, in percent.
# Both MAP values were measured on the same topics that were used for tuning.
improvement = (best_hybrid['map'] - best_bm25['map']) / best_bm25['map'] * 100
print(f"\nImprovement over BM25: {improvement:+.2f}%")
print("\n✓ Hybrid neural re-ranking configured")

## 7. Results Summary

### 7.1. Training Performance Comparison

In [None]:
# One row per method, holding the best MAP found during tuning and the
# parameters that produced it. All MAP values come from the training topics.
summary = pd.DataFrame([
    {
        "Method": "BM25",
        "MAP": best_bm25["map"],
        "Parameters": f"k1={best_k1}, b={best_b}"
    },
    {
        "Method": "BM25 + RM3",
        "MAP": best_rm3["map"],
        "Parameters": f"fb_terms={best_fb_terms}, fb_docs={best_fb_docs}, orig_w={best_orig_w}"
    },
    {
        "Method": "Hybrid Neural (Advanced)",
        "MAP": best_hybrid["map"],
        "Parameters": f"rerank_depth={best_rerank_depth}, rrf_k={best_rrf_k}"
    }
])

# NOTE(review): the "50 queries" in the banner is a literal, not derived from
# len(train_queries).
print("="*80)
print("TRAINING PERFORMANCE SUMMARY (50 queries)")
print("="*80)
print(summary.to_string(index=False))
print("="*80)

### 7.2. Analysis

In [None]:
# `plt` comes from the notebook's import cell, so a missing matplotlib install
# fails at start-up rather than after all tuning has already run.
fig, ax = plt.subplots(figsize=(10, 5))

methods = summary["Method"].tolist()
maps = summary["MAP"].tolist()
# One colour per method, in the row order of `summary`.
colors = ["#3498db", "#2ecc71", "#e74c3c"]

bars = ax.bar(methods, maps, color=colors, edgecolor="black")
ax.set_ylabel("MAP")
ax.set_title("Training Performance Comparison (50 queries)")
# Leave headroom above the tallest bar for the value labels.
ax.set_ylim(0, max(maps) * 1.15)

# Annotate every bar with its MAP value.
for bar, m in zip(bars, maps):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
            f"{m:.4f}", ha="center", va="bottom", fontsize=11)

plt.tight_layout()
plt.show()

print("\n✓ All methods tuned and evaluated on training queries")
print("\n→ Ready to generate submission files for test queries")

## 8. Generate Submission Files

### 8.1. Run Inference on Test Queries

In [None]:
# Apply the three tuned configurations to the unjudged test topics.
print(f"Running inference on {len(test_queries)} test queries...\n")

# Method 1: BM25
print("Method 1: BM25")
run_1 = search_batch(bm25_searcher, test_queries, k=1000)
print(f"  ✓ {len(run_1)} queries processed")

# Method 2: BM25 + RM3
print("\nMethod 2: BM25 + RM3")
run_2 = search_batch(rm3_searcher, test_queries, k=1000)
print(f"  ✓ {len(run_2)} queries processed")

# Method 3: Hybrid Neural
# Uses the tuned BM25 searcher as first stage together with the best
# rerank_depth / rrf_k found during hybrid tuning.
print("\nMethod 3: Hybrid Neural Re-ranking")
run_3 = hybrid_search(
    test_queries,
    bm25_searcher,
    rerank_depth=best_rerank_depth,
    rrf_k=best_rrf_k,
    final_k=1000
)
print(f"  ✓ {len(run_3)} queries processed")

print("\n✓ All test queries processed")

### 8.2. Export TREC Format

In [None]:
def _topic_sort_key(qid: str) -> Tuple[int, int, str]:
    """Sort key: numeric topic ids by value first, all other ids by text."""
    try:
        return (0, int(qid), "")
    except ValueError:
        return (1, 0, str(qid))


def write_trec_run(
    run: Dict[str, List[Tuple[str, float]]],
    filepath: str,
    run_name: str,
    max_docs: int = 1000
):
    """
    Write run to TREC format.
    Format: topic_id Q0 doc_id rank score run_name

    Args:
        run: ``{qid: [(docid, score), ...]}``.
        filepath: Output path; an existing file is overwritten.
        run_name: Tag written in the last column of every line.
        max_docs: Maximum number of lines written per topic (default 1000).

    Topics are written in numeric order where the id is an integer; any
    non-numeric ids follow in lexicographic order instead of raising.
    Within a topic, results are sorted by score descending with a stable sort,
    so equal scores keep their input order. Ranks start at 1 and scores are
    written with six decimals.
    """
    with open(filepath, "w", encoding="utf-8") as f:
        for qid in sorted(run, key=_topic_sort_key):
            ranked = sorted(run[qid], key=lambda pair: pair[1], reverse=True)
            for rank, (docid, score) in enumerate(ranked[:max_docs], start=1):
                f.write(f"{qid} Q0 {docid} {rank} {score:.6f} {run_name}\n")

    print(f"✓ Written: {filepath}")


print("Writing submission files...\n")

# One file per method; the run tag in the last column matches the file stem.
write_trec_run(run_1, "run_1.res", "run_1")
write_trec_run(run_2, "run_2.res", "run_2")
write_trec_run(run_3, "run_3.res", "run_3")

print("\n✓ All submission files written")

### 8.3. Validate Output

In [None]:
def validate_run_file(filepath: str, expected_queries: int = 199, docs_per_query: int = 1000):
    """Validate TREC run file format and contents.

    Checks that every non-blank line has six columns, that no document is
    listed twice for a topic, that the file holds ``expected_queries`` topics
    with exactly ``docs_per_query`` lines each, and that scores do not
    increase as rank increases.

    Raises:
        ValueError: A line does not have six columns, or its rank/score is
            not numeric.
        AssertionError: One of the content checks fails.
    """
    query_docs = defaultdict(list)
    seen_docids = defaultdict(set)

    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # A blank (e.g. trailing) line is not a format error.
                continue
            if len(parts) != 6:
                raise ValueError(
                    f"{filepath}:{line_no}: expected 6 columns, got {len(parts)}"
                )
            qid, _, docid, rank, score, _ = parts
            # Duplicates would satisfy the per-topic count check while the
            # run contains fewer distinct documents than required.
            assert docid not in seen_docids[qid], f"Query {qid}: duplicate document {docid}"
            seen_docids[qid].add(docid)
            query_docs[qid].append((int(rank), float(score)))

    # Check query count
    assert len(query_docs) == expected_queries, f"Expected {expected_queries} queries, got {len(query_docs)}"

    # Check docs per query and score ordering
    for qid, docs in query_docs.items():
        assert len(docs) == docs_per_query, f"Query {qid}: expected {docs_per_query} docs, got {len(docs)}"
        # Order by the rank column, then require non-increasing scores.
        scores = [s for _, s in sorted(docs, key=lambda x: x[0])]
        assert scores == sorted(scores, reverse=True), f"Query {qid}: scores not in decreasing order"

    print(f"✓ {filepath}: {len(query_docs)} queries × {docs_per_query} docs, scores non-increasing")


print("Validating submission files...\n")

# Take the expected topic count from the actual test set rather than from the
# validator's built-in default, so the check stays correct if the split changes.
for run_path in ("run_1.res", "run_2.res", "run_3.res"):
    validate_run_file(run_path, expected_queries=len(test_queries))

print("\n" + "="*60)
print("SUBMISSION FILES READY")
print("="*60)
print("Files: run_1.res, run_2.res, run_3.res")
print("Format: TREC 6-column")
print("="*60)