## 1. Setup & Dependencies

In [2]:
import pandas as pd
import json
import re
import string
from collections import Counter
from typing import List, Dict, Tuple, Optional, Literal
from dataclasses import dataclass
from tqdm.auto import tqdm
import itertools
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')

print("✓ Dependencies imported")

✓ Dependencies imported


  from .autonotebook import tqdm as notebook_tqdm


### Install Required Packages

In [3]:
# !pip install torch torchvision torchaudio
# !pip install pyserini==0.36.0
# !pip install accelerate
# !pip install transformers
# !pip install tqdm
# !pip install python-dotenv

In [4]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

!java -version

openjdk version "21.0.9" 2025-10-21
OpenJDK Runtime Environment (build 21.0.9+10-Ubuntu-122.04)
OpenJDK 64-Bit Server VM (build 21.0.9+10-Ubuntu-122.04, mixed mode, sharing)


In [5]:
# !pip install torch torchvision torchaudio
# !pip install pyserini==0.36.0
# !pip install accelerate
# !pip install transformers
# !pip install tqdm
# !pip install python-dotenv

### Hugging Face Authentication

In [6]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv()

login(os.getenv('HUGGING_FACE_TOKEN'))
print("✓ Logged into Hugging Face")

✓ Logged into Hugging Face


## 2. Data Loading & Preparation

In [7]:
# Load datasets
df_train = pd.read_csv("./data/train.csv", converters={"answers": json.loads})
df_test = pd.read_csv("./data/test.csv")

print(f"Train set: {len(df_train)} questions")
print(f"Test set: {len(df_test)} questions")
print(f"\nSample question: {df_train.iloc[0]['question']}")
print(f"Sample answers: {df_train.iloc[0]['answers']}")

Train set: 3778 questions
Test set: 2032 questions

Sample question: what is the name of justin bieber brother?
Sample answers: ['Jazmyn Bieber', 'Jaxon Bieber']


In [8]:
# Create train/validation split for experiments
RANDOM_SEED = 42
VAL_SIZE = 0.2

df_train_split = df_train.sample(frac=1-VAL_SIZE, random_state=RANDOM_SEED)
df_val = df_train.drop(df_train_split.index).reset_index(drop=True)
df_train_split = df_train_split.reset_index(drop=True)

print(f"✓ Split data:")
print(f"   Training: {len(df_train_split)} questions")
print(f"   Validation: {len(df_val)} questions")

✓ Split data:
   Training: 3022 questions
   Validation: 756 questions


## 3. Retrieval Functions

In [9]:
from pyserini.search import SimpleSearcher
from pyserini.index.lucene import IndexReader

# Load Pyserini index
print("Loading Pyserini index...")
searcher = SimpleSearcher.from_prebuilt_index('wikipedia-kilt-doc')
index_reader = IndexReader.from_prebuilt_index('wikipedia-kilt-doc')

print(f"✓ Index loaded: {index_reader.stats()['documents']} documents")

[0;93m2025-12-13 15:24:53.949730162 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


Loading Pyserini index...


Dec 13, 2025 3:24:54 PM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


SimpleSearcher class has been deprecated, please use LuceneSearcher from pyserini.search.lucene instead
✓ Index loaded: 5903530 documents


In [10]:
from sentence_transformers import SentenceTransformer
import torch

# Load bi-encoder
print("Loading bi-encoder...")
device = "cuda" if torch.cuda.is_available() else "cpu"
bi_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

print("✓ Bi-encoder loaded")

Loading bi-encoder...
✓ Bi-encoder loaded


In [11]:
from dataclasses import dataclass
from typing import List, Literal
import json
import torch
from sentence_transformers import util


@dataclass
class RetrievalConfig:
    """
    Passage-based retrieval configuration with optional bi-encoder reranking.
    """
    k_docs: int = 5
    k_passages: int = 3
    method: Literal["bm25", "qld"] = "qld"
    use_rerank: bool = True

    mu: int = 1000          # QLD smoothing
    k1: float = 0.9         # BM25
    b: float = 0.4          # BM25

    window: int = 150
    overlap: int = 50
    min_passage_words: int = 30

    def __str__(self):
        method_str = (
            f"QLD(mu={self.mu})"
            if self.method == "qld"
            else f"BM25(k1={self.k1}, b={self.b})"
        )
        rerank_str = "BiEncoder" if self.use_rerank else "NoRerank"
        return (
            f"{method_str} → {rerank_str} | "
            f"k_docs={self.k_docs}, k_passages={self.k_passages}"
        )


def extract_passages(
    text: str,
    window: int,
    overlap: int,
    min_words: int,
) -> List[str]:
    """
    Split text into overlapping word windows.
    """
    if not text:
        return []

    words = text.split()
    if len(words) < min_words:
        return []

    step = max(1, window - overlap)
    passages = []

    for i in range(0, len(words), step):
        chunk = words[i:i + window]
        if len(chunk) < min_words:
            break
        passages.append(" ".join(chunk))

    return passages


def bi_encoder_rerank(
    query: str,
    passages: List[str],
    top_k: int,
) -> List[str]:
    """
    Rerank passages using bi-encoder cosine similarity.
    """
    if not passages:
        return []

    q_emb = bi_encoder.encode(query, convert_to_tensor=True, device=device)
    p_embs = bi_encoder.encode(passages, convert_to_tensor=True, device=device)

    scores = util.cos_sim(q_emb, p_embs).squeeze(0)
    top_k = min(top_k, len(passages))
    idx = torch.topk(scores, k=top_k).indices.tolist()

    return [passages[i] for i in idx]


def retrieve_context(query: str, cfg: RetrievalConfig) -> List[str]:
    """
    Retrieve passages using BM25 or QLD, optionally followed by bi-encoder reranking.
    """
    if cfg.method == "bm25":
        searcher.set_bm25(cfg.k1, cfg.b)
    else:
        searcher.set_qld(cfg.mu)

    hits = searcher.search(query, cfg.k_docs)
    passages: List[str] = []

    for hit in hits:
        try:
            doc = searcher.doc(hit.docid)
            content = json.loads(doc.raw()).get("contents", "").replace("\n", " ")
            passages.extend(
                extract_passages(
                    content,
                    cfg.window,
                    cfg.overlap,
                    cfg.min_passage_words,
                )
            )
        except Exception:
            continue

    if not cfg.use_rerank:
        return passages[:cfg.k_passages]

    return bi_encoder_rerank(query, passages, cfg.k_passages)


query = "Who wrote Harry Potter?"

configs = [
    RetrievalConfig(method="qld", use_rerank=False),
    RetrievalConfig(method="qld", use_rerank=True),
    RetrievalConfig(method="bm25", use_rerank=False),
    RetrievalConfig(method="bm25", use_rerank=True),
]

for cfg in configs:
    print(cfg)
    test_passages = retrieve_context(query, cfg)
    for i, p in enumerate(test_passages, 1):
        print(f"{i}. {p[:100]}...")
    print()


QLD(mu=1000) → NoRerank | k_docs=5, k_passages=3
1. Harry Potter Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The...
2. June 1997, the books have found immense popularity, critical acclaim and commercial success worldwid...
3. English by two major publishers, Bloomsbury in the United Kingdom and Scholastic Press in the United...

QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
1. the film, has denied that Rowling ever saw it before writing her book. Rowling has said on record mu...
2. Harry Potter Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The...
3. by Emily Brontë, "Charlie and the Chocolate Factory" by Roald Dahl, "Robinson Crusoe" by Daniel Defo...

BM25(k1=0.9, b=0.4) → NoRerank | k_docs=5, k_passages=3
1. Bonnie Wright Bonnie Francesca Wright (born 17 February 1991) is an English actress, film director, ...
2. the Deathly Hallows – Part 1" and "Part 2", she began attending London's University of the 

## 4. LLM Generation

In [12]:
import transformers
import torch
import logging

# Suppress transformers warnings
transformers.logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

print("Loading LLM model...")
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto",
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Set pad_token for batch processing
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token

print(f"✓ Model loaded on: {'GPU' if torch.cuda.is_available() else 'CPU'}")

Loading LLM model...
✓ Model loaded on: GPU


In [13]:
DEFAULT_SYSTEM_PROMPT = (
    "You must respond based strictly on the information in provided passages."
    "Do not incorporate any external knowledge or infer any details beyond what is given."
    "If the answer is not in the context, return 'I dont know'."
    "Do not include explanations, only the final answer!"
)

DEFAULT_USER_PROMPT = (
    "Based on the following documents, provide a concise answer to the question.\n\n"
    "{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

@dataclass
class PromptConfig:
    """Configuration for prompt generation and LLM parameters."""
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
    user_prompt: str = DEFAULT_USER_PROMPT
    temperature: float = 0.1
    top_p: float = 0.9
    max_new_tokens: int = 256
    do_sample: bool = True
    
    def __str__(self):
        return f"temp={self.temperature}, top_p={self.top_p}, max_tokens={self.max_new_tokens}"

def clean_answer(answer: str) -> str:
    """Clean and standardize the generated answer."""
    answer = re.sub(r'^(Answer|The answer is|Based on the .*?,):?\s*', '', answer, flags=re.I)
    answer = answer.rstrip('.')
    if any(phrase in answer.lower() for phrase in ["dont know", "don't know", "do not know", "unknown"]):
        return "unknown"
    return answer.strip()

def create_messages(question: str, contexts: List[str], config: PromptConfig) -> List[Dict]:
    """Create messages for the LLM based on the question, contexts, and prompt configuration."""
    if not contexts:
        context_str = "No relevant documents found."
    else:
        context_str = '\n\n'.join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(contexts)])
    
    return [
        {"role": "system", "content": config.system_prompt},
        {"role": "user", "content": config.user_prompt.format(context=context_str, question=question)}
    ]

def generate_answer(question: str, contexts: List[str], config: PromptConfig) -> str:
    """Generate an answer using the LLM based on the question, contexts, and prompt configuration."""
    messages = create_messages(question, contexts, config)
    
    outputs = pipeline(
        messages,
        max_new_tokens=config.max_new_tokens,
        eos_token_id=terminators,
        do_sample=config.do_sample,
        temperature=config.temperature,
        top_p=config.top_p,
    )
    
    answer = outputs[0]["generated_text"][-1].get('content', '')
    return clean_answer(answer)

def batch_generate_answers(questions: List[str], contexts_list: List[List[str]], config: PromptConfig) -> List[str]:
    """Generate answers for multiple questions in batch."""
    # Create messages for all questions
    batch_messages = [create_messages(q, ctx, config) for q, ctx in zip(questions, contexts_list)]
    
    # Process batch through pipeline
    outputs = pipeline(
        batch_messages,
        max_new_tokens=config.max_new_tokens,
        eos_token_id=terminators,
        do_sample=config.do_sample,
        temperature=config.temperature,
        top_p=config.top_p
    )
    
    # Extract and clean answers
    answers = []
    for output in outputs:
        answer = output[0]["generated_text"][-1].get('content', '')
        answers.append(clean_answer(answer))
    
    return answers

test_prompt_config = PromptConfig(temperature=0.1)
print(f"Testing: {test_prompt_config}")
test_answer = generate_answer(query, test_passages, test_prompt_config)
print(f"✓ Generated answer: '{test_answer}'")

Testing: temp=0.1, top_p=0.9, max_tokens=256
✓ Generated answer: 'J. K. Rowling'


## 5. Evaluation Metrics

In [14]:
def normalize_answer(s: str) -> str:
    """Normalize answer for comparison"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    
    def white_space_fix(text):
        return ' '.join(text.split())
    
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    
    def lower(text):
        return text.lower()
    
    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction: str, ground_truth: str) -> float:
    """Compute token-level F1 score"""
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    
    if len(pred_tokens) == 0 or len(gt_tokens) == 0:
        return int(pred_tokens == gt_tokens)
    if num_same == 0:
        return 0
    
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def evaluate_predictions(df_gold: pd.DataFrame, predictions: Dict[int, str]) -> Dict:
    """
    Evaluate predictions against ground truth.
    
    Returns:
        Dict with average metrics and individual scores:
        {
            'f1': average_f1,
            'precision': average_precision,
            'recall': average_recall,
            'exact_match': exact_match_percentage,
            'f1_scores': list of individual f1 scores,
            'precision_scores': list of individual precision scores,
            'recall_scores': list of individual recall scores,
            'exact_matches': list of individual exact match flags
        }
    """
    f1_scores = []
    precision_scores = []
    recall_scores = []
    exact_matches = []
    
    for _, row in df_gold.iterrows():
        qid = row['id']
        if qid not in predictions:
            f1_scores.append(0.0)
            precision_scores.append(0.0)
            recall_scores.append(0.0)
            exact_matches.append(0)
            continue
        
        prediction = predictions[qid]
        ground_truths = row['answers']
        
        # Compute metrics for each ground truth and take the best
        best_f1 = 0.0
        best_precision = 0.0
        best_recall = 0.0
        is_exact = 0
        
        for gt in ground_truths:
            # F1 score
            pred_tokens = normalize_answer(prediction).split()
            gt_tokens = normalize_answer(gt).split()
            
            common = Counter(pred_tokens) & Counter(gt_tokens)
            num_same = sum(common.values())
            
            if len(pred_tokens) == 0 or len(gt_tokens) == 0:
                f1 = int(pred_tokens == gt_tokens)
                prec = int(pred_tokens == gt_tokens)
                rec = int(pred_tokens == gt_tokens)
            elif num_same == 0:
                f1 = 0.0
                prec = 0.0
                rec = 0.0
            else:
                prec = num_same / len(pred_tokens)
                rec = num_same / len(gt_tokens)
                f1 = (2 * prec * rec) / (prec + rec)
            
            # Track best scores
            if f1 > best_f1:
                best_f1 = f1
                best_precision = prec
                best_recall = rec
            
            # Exact match
            if normalize_answer(prediction) == normalize_answer(gt):
                is_exact = 1
        
        f1_scores.append(best_f1)
        precision_scores.append(best_precision)
        recall_scores.append(best_recall)
        exact_matches.append(is_exact)
    
    return {
        'f1': 100.0 * sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
        'precision': 100.0 * sum(precision_scores) / len(precision_scores) if precision_scores else 0.0,
        'recall': 100.0 * sum(recall_scores) / len(recall_scores) if recall_scores else 0.0,
        'exact_match': 100.0 * sum(exact_matches) / len(exact_matches) if exact_matches else 0.0,
        'f1_scores': f1_scores,
        'precision_scores': precision_scores,
        'recall_scores': recall_scores,
        'exact_matches': exact_matches
    }


# Test evaluation
test_predictions = {1: "J.K. Rowling", 2: "Paris", 3: "Shakespeare"}
test_gold = pd.DataFrame({
    'id': [1, 2, 3],
    'answers': [["J.K. Rowling", "Rowling"], ["Earth"], ["William Shakespeare", "Shakespeare"]]
})

test_metrics = evaluate_predictions(test_gold, test_predictions)
print(f"✓ Evaluation test: F1={test_metrics['f1']:.2f}, P={test_metrics['precision']:.2f}, R={test_metrics['recall']:.2f}, EM={test_metrics['exact_match']:.2f}")

✓ Evaluation test: F1=66.67, P=66.67, R=66.67, EM=66.67


## 6. Experiment Framework

In [None]:
def run_experiment(
    name: str,
    df_data: pd.DataFrame,
    retrieval_config: RetrievalConfig,
    prompt_config: PromptConfig,
    max_questions: Optional[int] = None,
    batch_size: int = 8,
    verbose: bool = True
) -> Dict:
    if max_questions:
        df_data = df_data.head(max_questions)
    
    predictions = {}
    
    # Process in batches
    num_batches = (len(df_data) + batch_size - 1) // batch_size
    iterator = tqdm(range(num_batches), desc=name) if verbose else range(num_batches)
    
    for batch_idx in iterator:
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(df_data))
        batch_df = df_data.iloc[start_idx:end_idx]
        
        # Retrieve contexts for all questions in batch
        batch_questions = []
        batch_qids = []
        batch_contexts = []
        
        for _, row in batch_df.iterrows():
            question = row['question']
            qid = row['id']
            contexts = retrieve_context(question, retrieval_config)
            
            batch_questions.append(question)
            batch_qids.append(qid)
            batch_contexts.append(contexts)
        
        # Generate answers in batch
        batch_answers = batch_generate_answers(batch_questions, batch_contexts, prompt_config)
        
        # Store predictions
        for qid, answer in zip(batch_qids, batch_answers):
            predictions[qid] = answer
    
    metrics = evaluate_predictions(df_data, predictions)
    
    result = {
        'name': name,
        'retrieval': retrieval_config,
        'prompt': prompt_config,
        'f1_score': metrics['f1'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'exact_match': metrics['exact_match'],
        'num_questions': len(df_data),
        'predictions': predictions,
        'f1_scores': metrics['f1_scores'],
        'precision_scores': metrics['precision_scores'],
        'recall_scores': metrics['recall_scores'],
        'exact_matches': metrics['exact_matches']
    }
    
    if verbose:
        print(f"\n{name}")
        print(f"   Retrieval: {retrieval_config}")
        print(f"   Prompt: {prompt_config}")
        print(f"   F1={metrics['f1']:.2f} | P={metrics['precision']:.2f} | R={metrics['recall']:.2f} | EM={metrics['exact_match']:.2f}")
        print(f"   Questions: {len(df_data)}\n")
    
    return result

# Test experiment
test_retrieval = RetrievalConfig()
test_prompt = PromptConfig(temperature=0.1)
print(f"Testing experiment with:")
print(f"  Retrieval: {test_retrieval}")
print(f"  Prompt: {test_prompt}")

test_exp = run_experiment(
    "Quick Test",
    df_val.head(5),
    test_retrieval,
    test_prompt,
    verbose=True
)

print(f"✓ Experiment framework ready")

Testing experiment with:
  Retrieval: QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
  Prompt: temp=0.1, top_p=0.9, max_tokens=256
✓ Experiment framework ready


## 7. Experiments

In [20]:
# Global experiment settings
EXPERIMENT_SEED = 42
EXPERIMENT_QUESTIONS = 100
EXPERIMENT_LOG_PATH = "./results/grid_search_results.csv"

def build_cfg(base: dict, override: dict) -> RetrievalConfig:
    """Build RetrievalConfig from base + override dictionaries."""
    return RetrievalConfig(**{**base, **override})

print("="*80)
print("PHASED RETRIEVAL EXPERIMENT FRAMEWORK")
print("="*80)
print(f"Validation questions per config: {EXPERIMENT_QUESTIONS}")
print(f"Random seed: {EXPERIMENT_SEED}")
print(f"Results cache: {EXPERIMENT_LOG_PATH}")
print("="*80)

PHASED RETRIEVAL EXPERIMENT FRAMEWORK
Validation questions per config: 100
Random seed: 42
Results cache: ./results/grid_search_results.csv


In [None]:
def build_cfg(base: dict, override: dict) -> RetrievalConfig:
    """Build RetrievalConfig from base + override dictionaries."""
    return RetrievalConfig(**{**base, **override})


def generate_config_key(cfg: RetrievalConfig) -> str:
    """Generate a unique, human-readable key for a retrieval configuration."""
    method_part = (
        f"QLD_mu{cfg.mu}"
        if cfg.method == "qld"
        else f"BM25_k1{cfg.k1}_b{cfg.b}"
    )

    return (
        f"{cfg.method.upper()}_"
        f"kdocs{cfg.k_docs}_"
        f"kpass{cfg.k_passages}_"
        f"{'RERANK' if cfg.use_rerank else 'NORERANK'}_"
        f"{method_part}_"
        f"win{cfg.window}_ovl{cfg.overlap}"
    )


def save_results_to_csv(result: dict, config_key: str, path: str) -> None:
    """Append experiment results to CSV."""
    os.makedirs(os.path.dirname(path), exist_ok=True)

    row = {
        "config_key": config_key,
        "f1": result["f1_score"],
        "precision": result["precision"],
        "recall": result["recall"],
        "exact_match": result["exact_match"],
        "num_questions": result["num_questions"],
    }

    df_row = pd.DataFrame([row])

    if not os.path.exists(path):
        df_row.to_csv(path, index=False)
    else:
        df_row.to_csv(path, mode="a", header=False, index=False)


def load_completed_configs(path: str) -> set:
    """Load already evaluated configuration keys."""
    if not os.path.exists(path):
        return set()

    df = pd.read_csv(path)
    return set(df["config_key"].tolist())


In [48]:
validation_data = df_val.sample(
    n=EXPERIMENT_QUESTIONS,
    random_state=EXPERIMENT_SEED
).reset_index(drop=True)

completed_configs = load_completed_configs(EXPERIMENT_LOG_PATH)
print(f"Completed configs loaded: {len(completed_configs)}")

Completed configs loaded: 34


### Phase 1: Retrieval Method

Goal: Tune QLD μ and BM25 k1/b independently
Reranking disabled

In [39]:
PHASE_1_BASE = {
    "k_docs": 5,
    "k_passages": 3,
    "window": 150,
    "overlap": 50,
}

PHASE_1_METHODS = (
    [{"method": "qld", "mu": mu} for mu in [500, 1000, 2000]] +
    [{"method": "bm25", "k1": k1, "b": b} for (k1, b) in itertools.product([0.6, 0.9, 1.2], [0.4, 0.75])]
)

# Build all Phase-1 configs
phase1_configs = [
    build_cfg(PHASE_1_BASE, {**params, "use_rerank": rerank})
    for params in PHASE_1_METHODS
    for rerank in [True, False]
]

print(f"Phase 1 total configs: {len(phase1_configs)}")

# Filter already completed
phase1_pending = [
    cfg for cfg in phase1_configs
    if generate_config_key(cfg) not in completed_configs
]

print(f"Phase 1 pending configs: {len(phase1_pending)}")
print("-" * 80)

# Run experiments
for idx, cfg in enumerate(phase1_pending, start=1):
    key = generate_config_key(cfg)
    print(f"[{idx}/{len(phase1_pending)}] Running: {key}")

    result = run_experiment(
        name=key,
        df_data=validation_data,
        retrieval_config=cfg,
        prompt_config=test_prompt,
        verbose=True
    )

    save_results_to_csv(result, key, EXPERIMENT_LOG_PATH)
    print(f"✓ F1={result['f1_score']:.2f}")


Phase 1 total configs: 18
Phase 1 pending configs: 0
--------------------------------------------------------------------------------


In [None]:
best_phase_1_configs = extract_best_configs_from_phase(phase1_configs)

BEST_PHASE_1_CONFIGS = [
    {'method': 'qld', 'mu': 1000, 'use_rerank': True},
    {'method': 'bm25', 'k1': 1.2, 'b': 0.4, 'use_rerank': True},
]

### Phase 2: Passage Segmentation
Goal: tune window / overlap with best retrieval params

In [41]:
PHASE_2_BASE = {
    "k_docs": 5,
    "k_passages": 3,
}

PHASE_2_PASSAGES = [
    {"window": w, "overlap": o}
    for w, o in itertools.product([100, 150, 200], [30, 50])
]

# Build Phase-2 configs
phase2_configs = [
    build_cfg(PHASE_2_BASE, {**method_cfg, **passage_cfg})
    for method_cfg in BEST_PHASE_1_CONFIGS
    for passage_cfg in PHASE_2_PASSAGES
]

print(f"Phase 2 total configs: {len(phase2_configs)}")

# Filter already completed configs
phase2_pending = [
    cfg for cfg in phase2_configs
    if generate_config_key(cfg) not in completed_configs
]

print(f"Phase 2 pending configs: {len(phase2_pending)}")
print("-" * 80)

# Run Phase-2 experiments
for idx, cfg in enumerate(phase2_pending, start=1):
    key = generate_config_key(cfg)
    print(f"[{idx}/{len(phase2_pending)}] Running: {key}")

    result = run_experiment(
        name=key,
        df_data=validation_data,
        retrieval_config=cfg,
        prompt_config=test_prompt,
        verbose=True
    )

    save_results_to_csv(result, key, EXPERIMENT_LOG_PATH)
    print(f"✓ F1={result['f1_score']:.2f}")


Phase 2 total configs: 12
Phase 2 pending configs: 9
--------------------------------------------------------------------------------
[1/9] Running: QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win100_ovl50


QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win100_ovl50: 100%|██████████| 13/13 [01:26<00:00,  6.62s/it]



QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win100_ovl50
   Retrieval: QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=13.15 | P=14.10 | R=20.07 | EM=4.00
   Questions: 100

✓ F1=13.15
[2/9] Running: QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win150_ovl30


QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win150_ovl30: 100%|██████████| 13/13 [01:08<00:00,  5.29s/it]



QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win150_ovl30
   Retrieval: QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=13.68 | P=13.49 | R=21.62 | EM=5.00
   Questions: 100

✓ F1=13.68
[3/9] Running: QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl30


QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl30: 100%|██████████| 13/13 [01:19<00:00,  6.11s/it]



QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl30
   Retrieval: QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=16.79 | P=17.12 | R=27.65 | EM=9.00
   Questions: 100

✓ F1=16.79
[4/9] Running: QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl50


QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl50: 100%|██████████| 13/13 [01:26<00:00,  6.68s/it]



QLD_kdocs5_kpass3_RERANK_QLD_mu1000_win200_ovl50
   Retrieval: QLD(mu=1000) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=19.57 | P=18.85 | R=33.80 | EM=7.00
   Questions: 100

✓ F1=19.57
[5/9] Running: BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl30


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl30: 100%|██████████| 13/13 [01:06<00:00,  5.08s/it]



BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl30
   Retrieval: BM25(k1=1.2, b=0.4) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=16.59 | P=17.10 | R=25.68 | EM=5.00
   Questions: 100

✓ F1=16.59
[6/9] Running: BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl50


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl50: 100%|██████████| 13/13 [01:13<00:00,  5.67s/it]



BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win100_ovl50
   Retrieval: BM25(k1=1.2, b=0.4) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=14.53 | P=14.88 | R=21.57 | EM=7.00
   Questions: 100

✓ F1=14.53
[7/9] Running: BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win150_ovl30


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win150_ovl30: 100%|██████████| 13/13 [01:02<00:00,  4.77s/it]



BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win150_ovl30
   Retrieval: BM25(k1=1.2, b=0.4) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=11.57 | P=12.56 | R=16.02 | EM=6.00
   Questions: 100

✓ F1=11.57
[8/9] Running: BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl30


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl30: 100%|██████████| 13/13 [01:10<00:00,  5.44s/it]



BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl30
   Retrieval: BM25(k1=1.2, b=0.4) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=19.72 | P=20.19 | R=32.98 | EM=10.00
   Questions: 100

✓ F1=19.72
[9/9] Running: BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl50


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl50: 100%|██████████| 13/13 [01:12<00:00,  5.60s/it]


BM25_kdocs5_kpass3_RERANK_BM25_k11.2_b0.4_win200_ovl50
   Retrieval: BM25(k1=1.2, b=0.4) → BiEncoder | k_docs=5, k_passages=3
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1=18.16 | P=17.95 | R=29.68 | EM=9.00
   Questions: 100

✓ F1=18.16





In [43]:
BEST_PHASE_2_CONFIGS = [
 {'method': 'qld', 'mu': 1000, 'use_rerank': True, 'window': 200, 'overlap': 50},
 {'method': 'bm25', 'k1': 1.2, 'b': 0.4, 'use_rerank': True, 'window': 200, 'overlap': 30}
]

### Phase 3 — k_docs / k_passages Tradeoff
Goal: tune recall vs precision tradeoff

In [None]:
PHASE_3_K = [
    {"k_docs": 3, "k_passages": 2},
    {"k_docs": 5, "k_passages": 3}, # baseline
    {"k_docs": 8, "k_passages": 3},
    {"k_docs": 10, "k_passages": 5},
    {"k_docs": 15, "k_passages": 5},
    {"k_docs": 20, "k_passages": 7},
]

# Build Phase-3 configs
phase3_configs = [
    build_cfg({}, {**method_cfg, **k_cfg})
    for method_cfg in BEST_PHASE_2_CONFIGS
    for k_cfg in PHASE_3_K
]

print(f"Phase 3 total configs: {len(phase3_configs)}")

# Filter completed configs
phase3_pending = [
    cfg for cfg in phase3_configs
    if generate_config_key(cfg) not in completed_configs
]

print(f"Phase 3 pending configs: {len(phase3_pending)}")
print("-" * 80)

# Run Phase-3 experiments
for idx, cfg in enumerate(phase3_pending, start=1):
    key = generate_config_key(cfg)
    print(f"[{idx}/{len(phase3_pending)}] Running: {key}")

    result = run_experiment(
        name=key,
        df_data=validation_data,
        retrieval_config=cfg,
        prompt_config=test_prompt,
        verbose=True
    )

    save_results_to_csv(result, key, EXPERIMENT_LOG_PATH)
    print(f"✓ F1={result['f1_score']:.2f}")


Phase 3 total configs: 12
Phase 3 pending configs: 4
--------------------------------------------------------------------------------
[1/4] Running: QLD_kdocs15_kpass5_RERANK_QLD_mu1000_win200_ovl50


QLD_kdocs15_kpass5_RERANK_QLD_mu1000_win200_ovl50:  31%|███       | 4/13 [00:38<01:26,  9.66s/it]