## 1. Setup & Dependencies

In [1]:
import pandas as pd
import json
import re
import string
from collections import Counter
from typing import List, Dict, Tuple, Optional, Literal
from dataclasses import dataclass
from tqdm.auto import tqdm
import itertools
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')

print("✓ Dependencies imported")

✓ Dependencies imported


  from .autonotebook import tqdm as notebook_tqdm


### Install Required Packages

In [2]:
# !pip install torch torchvision torchaudio
# !pip install pyserini==0.36.0
# !pip install accelerate
# !pip install transformers
# !pip install tqdm
# !pip install python-dotenv

In [3]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

!java -version

openjdk version "21.0.9" 2025-10-21
OpenJDK Runtime Environment (build 21.0.9+10-Ubuntu-122.04)
OpenJDK 64-Bit Server VM (build 21.0.9+10-Ubuntu-122.04, mixed mode, sharing)


In [4]:
# !pip install torch torchvision torchaudio
# !pip install pyserini==0.36.0
# !pip install accelerate
# !pip install transformers
# !pip install tqdm
# !pip install python-dotenv

### Hugging Face Authentication

In [5]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

load_dotenv()

login(os.getenv('HUGGING_FACE_TOKEN'))
print("✓ Logged into Hugging Face")

✓ Logged into Hugging Face


## 2. Data Loading & Preparation

In [6]:
# Load datasets
df_train = pd.read_csv("./data/train.csv", converters={"answers": json.loads})
df_test = pd.read_csv("./data/test.csv")

print(f"Train set: {len(df_train)} questions")
print(f"Test set: {len(df_test)} questions")
print(f"\nSample question: {df_train.iloc[0]['question']}")
print(f"Sample answers: {df_train.iloc[0]['answers']}")

Train set: 3778 questions
Test set: 2032 questions

Sample question: what is the name of justin bieber brother?
Sample answers: ['Jazmyn Bieber', 'Jaxon Bieber']


## 3. Retrieval Functions

In [7]:
from pyserini.search import SimpleSearcher
from pyserini.index.lucene import IndexReader

# Load Pyserini index
print("Loading Pyserini index...")
searcher = SimpleSearcher.from_prebuilt_index('wikipedia-kilt-doc')
index_reader = IndexReader.from_prebuilt_index('wikipedia-kilt-doc')

print(f"✓ Index loaded: {index_reader.stats()['documents']} documents")

[0;93m2025-12-14 07:55:53.864584185 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


Loading Pyserini index...


Dec 14, 2025 7:55:54 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


SimpleSearcher class has been deprecated, please use LuceneSearcher from pyserini.search.lucene instead
✓ Index loaded: 5903530 documents


In [8]:
from sentence_transformers import SentenceTransformer
import torch

# Load bi-encoder
print("Loading bi-encoder...")
device = "cuda" if torch.cuda.is_available() else "cpu"
bi_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

print("✓ Bi-encoder loaded")

Loading bi-encoder...
✓ Bi-encoder loaded


In [None]:
from dataclasses import dataclass
from typing import List
from functools import lru_cache
import json
import torch
from sentence_transformers import util

@lru_cache(maxsize=1000)
def get_doc_content(docid: str) -> str:
    """Cache document content extraction."""
    try:
        doc = searcher.doc(docid)
        return json.loads(doc.raw()).get("contents", "").replace("\n", " ")
    except Exception:
        return ""
    
@dataclass
class RetrievalManager:
    """
    Manages passage-based hybrid retrieval with RRF fusion and bi-encoder reranking.
    """
    k_docs: int = 10
    k_passages: int = 5
    method: Literal['qld', 'bm25'] = 'hybrid'
    mu: int = 1000           # QLD smoothing
    k1: float = 0.9          # BM25
    b: float = 0.4           # BM25
    window: int = 150
    overlap: int = 50
    min_passage_words: int = 30
    
    def __str__(self):
        return (
            f"Retrieval(RRF_k={self.rrf_k}, μ={self.mu}, k1={self.k1}, b={self.b}) | "
            f"k_docs={self.k_docs}, k_passages={self.k_passages} | "
            f"window={self.window}, overlap={self.overlap}"
        )
    
    def extract_passages(self, text: str) -> List[str]:
        """Split text into overlapping word windows."""
        if not text:
            return []
        words = text.split()
        if len(words) < self.min_passage_words:
            return []
        
        step = max(1, self.window - self.overlap)
        passages = []
        
        for i in range(0, len(words), step):
            chunk = words[i:i + self.window]
            if len(chunk) < self.min_passage_words:
                break
            passages.append(" ".join(chunk))
        
        return passages
    
    def rerank(self, query: str, passages: List[str]) -> List[str]:
        """Rerank passages using bi-encoder with deduplication."""
        if not passages:
            return []
        
        # Deduplicate exact matches
        seen = set()
        unique_passages = []
        for p in passages:
            if p not in seen:
                seen.add(p)
                unique_passages.append(p)
        
        if not unique_passages:
            return []
        
        # Bi-encoder reranking
        q_emb = bi_encoder.encode(query, convert_to_tensor=True, device=device)
        p_embs = bi_encoder.encode(unique_passages, convert_to_tensor=True, device=device)
        scores = util.cos_sim(q_emb, p_embs).squeeze(0)
        
        top_k = min(self.k_passages, len(unique_passages))
        idx = torch.topk(scores, k=top_k).indices.tolist()
        
        return [unique_passages[i] for i in idx]
    
    def retrieve_context(self, query: str) -> List[str]:
        """
        Retrieve passages using Reciprocal Rank Fusion (RRF) + bi-encoder reranking.
        """
        scores = {}
        
        # QLD retrieval
        searcher.set_qld(self.mu)
        qld_hits = searcher.search(query, self.k_docs)
        for rank, hit in enumerate(qld_hits):
            scores[hit.docid] = scores.get(hit.docid, 0.0) + 1.0 / (self.rrf_k + rank + 1)
        
        # BM25 retrieval
        searcher.set_bm25(self.k1, self.b)
        bm25_hits = searcher.search(query, self.k_docs)
        for rank, hit in enumerate(bm25_hits):
            scores[hit.docid] = scores.get(hit.docid, 0.0) + 1.0 / (self.rrf_k + rank + 1)
        
        # Top-k documents by RRF score
        top_docids = sorted(scores, key=scores.get, reverse=True)[:self.k_docs]
        
        # Extract passages from top documents
        passages = []
        for docid in top_docids:
            content = get_doc_content(docid)
            if content:
                passages.extend(self.extract_passages(content))
        
        return self.rerank(query, passages)


# Test the RetrievalManager
query = "Who wrote Harry Potter?"
    
rm = RetrievalManager()

print(rm)
passages = rm.retrieve_context(query)

for i, p in enumerate(passages, 1):
    print(f"{i}. {p[:100]}...")

Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=5 | window=150, overlap=50
1. the film, has denied that Rowling ever saw it before writing her book. Rowling has said on record mu...
2. to resolve an ongoing feud between the organisation's northern and southern branches that had sapped...
3. Harry Potter Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The...
4. Harry Potter (character) Harry James Potter is the titular protagonist of J. K. Rowling's "Harry Pot...
5. by Emily Brontë, "Charlie and the Chocolate Factory" by Roald Dahl, "Robinson Crusoe" by Daniel Defo...


## 4. LLM Generation

In [10]:
import transformers
import torch
import logging

# Suppress transformers warnings
transformers.logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

print("Loading LLM model...")
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.float16},
    device_map="auto"
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Set pad_token for batch processing
pipeline.tokenizer.pad_token = pipeline.tokenizer.eos_token

print(f"✓ Model loaded on: {'GPU' if torch.cuda.is_available() else 'CPU'}")

Loading LLM model...
✓ Model loaded on: GPU


In [11]:
DEFAULT_SYSTEM_PROMPT = (
    "You must respond based strictly on the information in provided passages."
    "Do not incorporate any external knowledge or infer any details beyond what is given."
    "If the answer is not in the context, return 'I dont know'."
    "Do not include explanations, only the final answer!"
)

DEFAULT_USER_PROMPT = (
    "Based on the following documents, provide a concise answer to the question.\n\n"
    "{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

@dataclass
class PromptManager:
    """Manages prompt generation and LLM answer generation."""
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
    user_prompt: str = DEFAULT_USER_PROMPT
    temperature: float = 0.0
    top_p: float = 1.0
    max_new_tokens: int = 256
    do_sample: bool = False
    prompt_id: str = "default"  # For later use in prompt tuning
    
    def __str__(self):
        return f"temp={self.temperature}, top_p={self.top_p}, max_tokens={self.max_new_tokens}"

    @staticmethod
    def clean_answer(answer: str) -> str:
        """Clean and standardize the generated answer."""
        answer = re.sub(r'^(Answer|The answer is|Based on the .*?,):?\s*', '', answer, flags=re.I)
        answer = answer.rstrip('.')
        if any(phrase in answer.lower() for phrase in ["dont know", "don't know", "do not know", "unknown"]):
            return "unknown"
        return answer.strip()

    def create_messages(self, question: str, contexts: List[str]) -> List[Dict]:
        """Create messages for the LLM based on the question and contexts."""
        if not contexts:
            context_str = "No relevant documents found."
        else:
            context_str = '\n\n'.join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(contexts)])
        
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.user_prompt.format(context=context_str, question=question)}
        ]

    def generate_answer(self, question: str, contexts: List[str]) -> str:
        """Generate an answer using the LLM based on the question and contexts."""
        messages = self.create_messages(question, contexts)
        
        outputs = pipeline(
            messages,
            max_new_tokens=self.max_new_tokens,
            eos_token_id=terminators,
            do_sample=self.do_sample,
            temperature=self.temperature,
            top_p=self.top_p,
        )
        
        answer = outputs[0]["generated_text"][-1].get('content', '')
        return self.clean_answer(answer)

    def batch_generate_answers(self, questions: List[str], contexts_list: List[List[str]]) -> List[str]:
        """Generate answers for multiple questions in batch."""
        # Create messages for all questions
        batch_messages = [self.create_messages(q, ctx) for q, ctx in zip(questions, contexts_list)]
        
        # Process batch through pipeline
        outputs = pipeline(
            batch_messages,
            max_new_tokens=self.max_new_tokens,
            eos_token_id=terminators,
            do_sample=self.do_sample,
            temperature=self.temperature,
            top_p=self.top_p
        )
        
        # Extract and clean answers
        answers = []
        for output in outputs:
            answer = output[0]["generated_text"][-1].get('content', '')
            answers.append(self.clean_answer(answer))
        
        return answers


# Test the PromptManager
test_prompt_manager = PromptManager()
print(f"Testing: {test_prompt_manager}")
test_answer = test_prompt_manager.generate_answer(query, passages)
print(f"✓ Generated answer: '{test_answer}'")

Testing: temp=0.0, top_p=1.0, max_tokens=256
✓ Generated answer: 'J.K. Rowling'


## 5. Evaluation Metrics

In [12]:
def normalize_answer(s: str) -> str:
    """Normalize answer for comparison"""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    
    def white_space_fix(text):
        return ' '.join(text.split())
    
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    
    def lower(text):
        return text.lower()
    
    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_token_metrics(prediction: str, ground_truth: str) -> Tuple[float, float, float]:
    """
    Compute precision, recall, and F1 score for token-level comparison.
    Returns: (precision, recall, f1)
    """
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    
    # Handle empty cases
    if len(pred_tokens) == 0 or len(gt_tokens) == 0:
        match = int(pred_tokens == gt_tokens)
        return match, match, match
    
    # Compute overlap
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return 0.0, 0.0, 0.0
    
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return precision, recall, f1


def evaluate_predictions(df_gold: pd.DataFrame, predictions: Dict[int, str]) -> Dict:
    """Evaluate predictions against ground truth."""
    f1_scores = []
    precision_scores = []
    recall_scores = []
    exact_matches = []
    
    for _, row in df_gold.iterrows():
        qid = row['id']
        
        # Handle missing predictions
        if qid not in predictions:
            f1_scores.append(0.0)
            precision_scores.append(0.0)
            recall_scores.append(0.0)
            exact_matches.append(0)
            continue
        
        prediction = predictions[qid]
        ground_truths = row['answers']
        
        # Normalize once
        norm_prediction = normalize_answer(prediction)
        
        # Find best match across all ground truths
        best_f1 = 0.0
        best_precision = 0.0
        best_recall = 0.0
        is_exact = 0
        
        for gt in ground_truths:
            norm_gt = normalize_answer(gt)
            
            # Compute metrics
            prec, rec, f1 = compute_token_metrics(prediction, gt)
            
            # Track best scores
            if f1 > best_f1:
                best_f1 = f1
                best_precision = prec
                best_recall = rec
            
            # Check exact match
            if norm_prediction == norm_gt:
                is_exact = 1
        
        f1_scores.append(best_f1)
        precision_scores.append(best_precision)
        recall_scores.append(best_recall)
        exact_matches.append(is_exact)
    
    return {
        'f1': 100.0 * sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
        'precision': 100.0 * sum(precision_scores) / len(precision_scores) if precision_scores else 0.0,
        'recall': 100.0 * sum(recall_scores) / len(recall_scores) if recall_scores else 0.0,
        'exact_match': 100.0 * sum(exact_matches) / len(exact_matches) if exact_matches else 0.0,
        'f1_scores': f1_scores,
        'precision_scores': precision_scores,
        'recall_scores': recall_scores,
        'exact_matches': exact_matches
    }


# Test evaluation
test_predictions = {1: "J.K. Rowling", 2: "Paris", 3: "Shakespeare"}
test_gold = pd.DataFrame({
    'id': [1, 2, 3],
    'answers': [["J.K. Rowling", "Rowling"], ["Earth"], ["William Shakespeare", "Shakespeare"]]
})

test_metrics = evaluate_predictions(test_gold, test_predictions)
print(f"✓ Evaluation test: F1={test_metrics['f1']:.2f}, P={test_metrics['precision']:.2f}, R={test_metrics['recall']:.2f}, EM={test_metrics['exact_match']:.2f}")

✓ Evaluation test: F1=66.67, P=66.67, R=66.67, EM=66.67


## 6. Experiment Framework

In [13]:
def run_experiment(
    name: str,
    df_data: pd.DataFrame,
    retrieval_manager: RetrievalManager,
    prompt_manager: PromptManager,
    max_questions: Optional[int] = None,
    batch_size: int = 4,
    verbose: bool = True
) -> Dict:
    if max_questions:
        df_data = df_data.head(max_questions)
    
    predictions = {}
    
    # Process in batches
    num_batches = (len(df_data) + batch_size - 1) // batch_size
    iterator = tqdm(range(num_batches), desc=name) if verbose else range(num_batches)
    
    for batch_idx in iterator:
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(df_data))
        batch_df = df_data.iloc[start_idx:end_idx]
        
        # Retrieve contexts for all questions in batch
        batch_questions = []
        batch_qids = []
        batch_contexts = []
        
        for _, row in batch_df.iterrows():
            question = row['question']
            qid = row['id']
            contexts = retrieval_manager.retrieve_context(question)
            
            batch_questions.append(question)
            batch_qids.append(qid)
            batch_contexts.append(contexts)
        
        # Generate answers in batch
        batch_answers = prompt_manager.batch_generate_answers(batch_questions, batch_contexts)
        
        # Store predictions
        for qid, answer in zip(batch_qids, batch_answers):
            predictions[qid] = answer
    
    metrics = evaluate_predictions(df_data, predictions)
    
    result = {
        'name': name,
        'retrieval': retrieval_manager,
        'prompt': prompt_manager,
        'f1_score': metrics['f1'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'exact_match': metrics['exact_match'],
        'num_questions': len(df_data),
        'predictions': predictions,
        'f1_scores': metrics['f1_scores'],
        'precision_scores': metrics['precision_scores'],
        'recall_scores': metrics['recall_scores'],
        'exact_matches': metrics['exact_matches']
    }
    
    if verbose:
        print(f"\n{name}")
        print(f"   Retrieval: {retrieval_manager}")
        print(f"   Prompt: {prompt_manager}")
        print(f"   F1={metrics['f1']:.2f} | P={metrics['precision']:.2f} | R={metrics['recall']:.2f} | EM={metrics['exact_match']:.2f}")
        print(f"   Questions: {len(df_data)}\n")
    
    return result

# Test experiment
test_retrieval = RetrievalManager()
test_prompt = PromptManager()
print(f"Testing experiment with:")
print(f"  Retrieval: {test_retrieval}")
print(f"  Prompt: {test_prompt}")

test_exp = run_experiment(
    "Quick Test",
    df_train.head(5),
    test_retrieval,
    test_prompt,
    verbose=True
)

print(f"✓ Experiment framework ready")

Testing experiment with:
  Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=5 | window=150, overlap=50
  Prompt: temp=0.0, top_p=1.0, max_tokens=256


Quick Test: 100%|██████████| 2/2 [00:05<00:00,  2.88s/it]


Quick Test
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=5 | window=150, overlap=50
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=22.86 | P=21.67 | R=30.00 | EM=20.00
   Questions: 5

✓ Experiment framework ready





## 7. Experiments

### Experiments global config

In [14]:
EXPERIMENT_SEED = 42
EXPERIMENT_QUESTIONS = 100

EXPERIMENT_LOG_PATH = (
    f"./results/grid_search_results_q{EXPERIMENT_QUESTIONS}.csv"
)

validation_data = df_train.sample(
    n=EXPERIMENT_QUESTIONS,
    random_state=EXPERIMENT_SEED
).reset_index(drop=True)

print("=" * 80)
print("PHASED RETRIEVAL + GENERATION EXPERIMENT FRAMEWORK")
print("=" * 80)
print(f"Validation questions per config: {EXPERIMENT_QUESTIONS}")
print(f"Random seed: {EXPERIMENT_SEED}")
print(f"Results cache: {EXPERIMENT_LOG_PATH}")
print("=" * 80)


PHASED RETRIEVAL + GENERATION EXPERIMENT FRAMEWORK
Validation questions per config: 100
Random seed: 42
Results cache: ./results/grid_search_results_q100.csv


### Experiments utils

In [15]:
def build_retrieval_manager(base: dict, override: dict) -> RetrievalManager:
    """Build RetrievalManager safely."""
    return RetrievalManager(**{**base, **override})


def generate_config_key(
    retrieval_mgr: RetrievalManager,
    prompt_mgr: PromptManager,
) -> str:
    """Generate unique config key for RRF-based retrieval."""
    return (
        f"RRF_k{retrieval_mgr.rrf_k}_"
        f"mu{retrieval_mgr.mu}_"
        f"k1{retrieval_mgr.k1}_b{retrieval_mgr.b}_"
        f"kdocs{retrieval_mgr.k_docs}_"
        f"kpass{retrieval_mgr.k_passages}_"
        f"win{retrieval_mgr.window}_ovl{retrieval_mgr.overlap}_"
        f"prompt{prompt_mgr.prompt_id}"
    )


def save_results_to_csv(result: dict, key: str, path: str):
    os.makedirs(os.path.dirname(path), exist_ok=True)

    row = {
        "config_key": key,
        "f1": result["f1_score"],
        "precision": result["precision"],
        "recall": result["recall"],
        "exact_match": result["exact_match"],
        "num_questions": result["num_questions"],
    }

    df = pd.DataFrame([row])
    if not os.path.exists(path):
        df.to_csv(path, index=False)
    else:
        df.to_csv(path, mode="a", header=False, index=False)


def load_completed_configs(path: str) -> set[str]:
    if not os.path.exists(path):
        return set()
    return set(pd.read_csv(path)["config_key"])


### Best-config selector

In [16]:
def select_top_k_configs(
    retrieval_managers: list[RetrievalManager],
    prompt_managers: list[PromptManager],
    *,
    top_k: int = 5,
):
    """
    Return the top-K configurations by validation F1 score.
    Always sorted by descending F1.
    """
    df = pd.read_csv(EXPERIMENT_LOG_PATH)

    scored_entries = []

    for r_mgr, p_mgr in zip(retrieval_managers, prompt_managers):
        key = generate_config_key(r_mgr, p_mgr)
        row = df[df["config_key"] == key]
        if row.empty:
            continue

        scored_entries.append({
            "retrieval_mgr": r_mgr,
            "prompt_mgr": p_mgr,
            "f1": float(row.iloc[0]["f1"]),
            "config_key": key,
        })

    # Sort before slicing
    scored_entries.sort(
        key=lambda x: (x["f1"], x["config_key"]),
        reverse=True,
    )

    return scored_entries[:top_k]


### Phase runner

In [17]:
def run_phase(
    *,
    phase_name: str,
    grid: list[dict],
    validation_data,
    top_k: int | None = None,
):
    """
    Run a single experiment phase.

    Each grid item must contain:
      - retrieval_mgr: RetrievalManager
      - prompt_mgr: PromptManager

    Returns:
      - top-K configs sorted by F1 (if top_k is provided)
      - otherwise, the full grid sorted by F1
    """
    print("\n" + "=" * 80)
    print(phase_name)
    print("=" * 80)

    completed = load_completed_configs(EXPERIMENT_LOG_PATH)

    pending = [
        g for g in grid
        if generate_config_key(g["retrieval_mgr"], g["prompt_mgr"]) not in completed
    ]

    print(f"Total configs: {len(grid)}")
    print(f"Completed configs: {len(grid) - len(pending)}")
    print(f"Pending configs: {len(pending)}")
    print("-" * 80)

    for i, entry in enumerate(pending, start=1):
        retrieval_mgr = entry["retrieval_mgr"]
        prompt_mgr = entry["prompt_mgr"]

        key = generate_config_key(retrieval_mgr, prompt_mgr)
        print(f"[{i}/{len(pending)}] Running: {key}")

        result = run_experiment(
            name=key,
            df_data=validation_data,
            retrieval_manager=retrieval_mgr,
            prompt_manager=prompt_mgr,
            verbose=True,
        )

        save_results_to_csv(result, key, EXPERIMENT_LOG_PATH)
        print(f"✓ F1={result['f1_score']:.4f}")

    # Load results once for consistent sorting
    df = pd.read_csv(EXPERIMENT_LOG_PATH)

    def get_f1(entry):
        key = generate_config_key(entry["retrieval_mgr"], entry["prompt_mgr"])
        row = df[df["config_key"] == key]
        return float(row.iloc[0]["f1"]) if not row.empty else -1.0

    # Sort full grid by F1
    sorted_grid = sorted(
        grid,
        key=lambda g: (get_f1(g), generate_config_key(g["retrieval_mgr"], g["prompt_mgr"])),
        reverse=True,
    )

    if top_k is None:
        return sorted_grid

    top_configs = select_top_k_configs(
        [g["retrieval_mgr"] for g in sorted_grid],
        [g["prompt_mgr"] for g in sorted_grid],
        top_k=top_k,
    )

    print("\nTop configs selected:")
    for i, entry in enumerate(top_configs, 1):
        print(
            f"{i}. {entry['config_key']} | "
            f"F1={entry['f1']:.4f}"
        )

    return top_configs


In [18]:
# ============================================================
# PHASE 1 — Passage Segmentation
# ============================================================

PHASE_1_GRID = []

BASE_RETRIEVAL_PARAMS = {
    "k_docs": 10,       
    "k_passages": 5,
    "mu": 1000,
    "k1": 0.9,
    "b": 0.4,
}

for window in [100, 150, 200, 250]:
    for overlap in [25, 50, 75]:
        PHASE_1_GRID.append({
            "retrieval_mgr": RetrievalManager(
                window=window,
                overlap=overlap,
                **BASE_RETRIEVAL_PARAMS,
            ),
            "prompt_mgr": PromptManager(),
        })

print(f"✓ Phase 1 grid size: {len(PHASE_1_GRID)}")

PHASE_1_TOP_CONFIGS = run_phase(
    phase_name="PHASE 1 — Passage Segmentation",
    grid=PHASE_1_GRID,
    validation_data=validation_data,
    top_k=5,
)


✓ Phase 1 grid size: 12

PHASE 1 — Passage Segmentation
Total configs: 12
Completed configs: 12
Pending configs: 0
--------------------------------------------------------------------------------

Top configs selected:
1. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win150_ovl50_promptdefault | F1=22.5510
2. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win250_ovl50_promptdefault | F1=21.7384
3. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win200_ovl25_promptdefault | F1=20.4083
4. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win250_ovl75_promptdefault | F1=20.1787
5. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win150_ovl75_promptdefault | F1=18.9896


In [19]:
# ============================================================
# PHASE 2 — Retrieval Capacity
# ============================================================

PHASE_2_GRID = []

for entry in PHASE_1_TOP_CONFIGS:
    base_mgr = entry["retrieval_mgr"]

    for k_docs in [5, 10, 15, 20]:
        for k_passages in [3, 5, 7]:
            PHASE_2_GRID.append({
                "retrieval_mgr": RetrievalManager(
                    k_docs=k_docs,
                    k_passages=k_passages,
                    window=base_mgr.window,
                    overlap=base_mgr.overlap,
                    mu=base_mgr.mu,
                    k1=base_mgr.k1,
                    b=base_mgr.b,
                ),
                "prompt_mgr": PromptManager(),
            })

print(f"✓ Phase 2 grid size: {len(PHASE_2_GRID)}")

PHASE_2_TOP_CONFIGS = run_phase(
    phase_name="PHASE 2 — Retrieval Capacity",
    grid=PHASE_2_GRID,
    validation_data=validation_data,
    top_k=3,
)


✓ Phase 2 grid size: 60

PHASE 2 — Retrieval Capacity
Total configs: 60
Completed configs: 36
Pending configs: 24
--------------------------------------------------------------------------------
[1/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win200_ovl25_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win200_ovl25_promptdefault: 100%|██████████| 25/25 [02:26<00:00,  5.87s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win200_ovl25_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=5 | window=200, overlap=25
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=19.61 | P=20.17 | R=29.02 | EM=7.00
   Questions: 100

✓ F1=19.6068
[2/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win200_ovl25_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win200_ovl25_promptdefault: 100%|██████████| 25/25 [02:42<00:00,  6.50s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win200_ovl25_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=7 | window=200, overlap=25
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=18.11 | P=17.89 | R=28.29 | EM=7.00
   Questions: 100

✓ F1=18.1083
[3/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:10<00:00,  2.84s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=3 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=16.14 | P=16.36 | R=23.55 | EM=4.00
   Questions: 100

✓ F1=16.1444
[4/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:32<00:00,  3.69s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=5 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=18.93 | P=18.49 | R=31.82 | EM=6.00
   Questions: 100

✓ F1=18.9258
[5/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:38<00:00,  3.94s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=7 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=9.49 | P=9.58 | R=15.00 | EM=4.00
   Questions: 100

✓ F1=9.4924
[6/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:38<00:00,  3.93s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=3 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=17.38 | P=17.69 | R=26.55 | EM=4.00
   Questions: 100

✓ F1=17.3849
[7/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:54<00:00,  4.59s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=7 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=12.39 | P=12.40 | R=17.70 | EM=6.00
   Questions: 100

✓ F1=12.3879
[8/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win250_ovl75_promptdefault: 100%|██████████| 25/25 [01:57<00:00,  4.70s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=3 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=17.86 | P=18.33 | R=27.55 | EM=4.00
   Questions: 100

✓ F1=17.8576
[9/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win250_ovl75_promptdefault: 100%|██████████| 25/25 [02:12<00:00,  5.32s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=5 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=18.97 | P=19.82 | R=28.55 | EM=7.00
   Questions: 100

✓ F1=18.9684
[10/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win250_ovl75_promptdefault: 100%|██████████| 25/25 [02:14<00:00,  5.40s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=7 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=13.97 | P=14.42 | R=19.93 | EM=5.00
   Questions: 100

✓ F1=13.9743
[11/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win250_ovl75_promptdefault: 100%|██████████| 25/25 [02:13<00:00,  5.34s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=3 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=19.36 | P=19.49 | R=30.30 | EM=5.00
   Questions: 100

✓ F1=19.3639
[12/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win250_ovl75_promptdefault: 100%|██████████| 25/25 [02:30<00:00,  6.03s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=5 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=20.00 | P=20.70 | R=29.80 | EM=8.00
   Questions: 100

✓ F1=20.0019
[13/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win250_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win250_ovl75_promptdefault: 100%|██████████| 25/25 [02:37<00:00,  6.29s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win250_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=7 | window=250, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=14.14 | P=14.01 | R=21.43 | EM=6.00
   Questions: 100

✓ F1=14.1372
[14/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win150_ovl75_promptdefault: 100%|██████████| 25/25 [01:32<00:00,  3.71s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass3_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=3 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=12.08 | P=11.89 | R=19.70 | EM=5.00
   Questions: 100

✓ F1=12.0797
[15/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win150_ovl75_promptdefault: 100%|██████████| 25/25 [01:49<00:00,  4.36s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass5_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=5 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=17.16 | P=17.76 | R=25.38 | EM=7.00
   Questions: 100

✓ F1=17.1553
[16/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win150_ovl75_promptdefault: 100%|██████████| 25/25 [01:54<00:00,  4.58s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs5_kpass7_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=5, k_passages=7 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=18.03 | P=17.92 | R=27.84 | EM=7.00
   Questions: 100

✓ F1=18.0282
[17/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win150_ovl75_promptdefault: 100%|██████████| 25/25 [02:16<00:00,  5.48s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass3_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=3 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=12.57 | P=12.66 | R=20.63 | EM=5.00
   Questions: 100

✓ F1=12.5740
[18/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win150_ovl75_promptdefault: 100%|██████████| 25/25 [02:33<00:00,  6.15s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass7_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=10, k_passages=7 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=19.68 | P=20.65 | R=27.29 | EM=8.00
   Questions: 100

✓ F1=19.6778
[19/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win150_ovl75_promptdefault: 100%|██████████| 25/25 [02:55<00:00,  7.01s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass3_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=3 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=12.06 | P=12.42 | R=18.38 | EM=4.00
   Questions: 100

✓ F1=12.0572
[20/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win150_ovl75_promptdefault: 100%|██████████| 25/25 [03:08<00:00,  7.56s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=5 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=15.28 | P=16.50 | R=23.23 | EM=5.00
   Questions: 100

✓ F1=15.2842
[21/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win150_ovl75_promptdefault: 100%|██████████| 25/25 [03:17<00:00,  7.90s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass7_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=15, k_passages=7 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=19.73 | P=21.46 | R=28.37 | EM=8.00
   Questions: 100

✓ F1=19.7349
[22/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win150_ovl75_promptdefault: 100%|██████████| 25/25 [03:30<00:00,  8.43s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass3_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=3 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=14.40 | P=15.05 | R=21.63 | EM=4.00
   Questions: 100

✓ F1=14.3962
[23/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win150_ovl75_promptdefault: 100%|██████████| 25/25 [03:46<00:00,  9.04s/it]



RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass5_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=5 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=16.08 | P=16.88 | R=24.56 | EM=5.00
   Questions: 100

✓ F1=16.0772
[24/24] Running: RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win150_ovl75_promptdefault


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win150_ovl75_promptdefault: 100%|██████████| 25/25 [03:58<00:00,  9.53s/it]


RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win150_ovl75_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=1000, k1=0.9, b=0.4) | k_docs=20, k_passages=7 | window=150, overlap=75
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=16.30 | P=17.23 | R=24.87 | EM=7.00
   Questions: 100

✓ F1=16.3039

Top configs selected:
1. RRF_k60_mu1000_k10.9_b0.4_kdocs15_kpass5_win150_ovl50_promptdefault | F1=22.7270
2. RRF_k60_mu1000_k10.9_b0.4_kdocs20_kpass7_win150_ovl50_promptdefault | F1=22.6114
3. RRF_k60_mu1000_k10.9_b0.4_kdocs10_kpass5_win150_ovl50_promptdefault | F1=22.5510





In [20]:
# ============================================================
# PHASE 3 — Lexical Hyperparameters
# ============================================================

PHASE_3_GRID = []

BM25_PARAMS = [
    {"k1": 0.6, "b": 0.3},
    {"k1": 0.9, "b": 0.4},   # baseline
    {"k1": 1.2, "b": 0.6},
]

QLD_PARAMS = [
    {"mu": 500},
    {"mu": 1000},           # baseline
    {"mu": 2000},
]

for entry in PHASE_2_TOP_CONFIGS:
    base_mgr = entry["retrieval_mgr"]

    for bm25 in BM25_PARAMS:
        for qld in QLD_PARAMS:
            PHASE_3_GRID.append({
                "retrieval_mgr": RetrievalManager(
                    k_docs=base_mgr.k_docs,
                    k_passages=base_mgr.k_passages,
                    window=base_mgr.window,
                    overlap=base_mgr.overlap,
                    k1=bm25["k1"],
                    b=bm25["b"],
                    mu=qld["mu"],
                ),
                "prompt_mgr": PromptManager(),
            })

print(f"✓ Phase 3 grid size: {len(PHASE_3_GRID)}")

PHASE_3_TOP_CONFIGS = run_phase(
    phase_name="PHASE 3 — Lexical Hyperparameters",
    grid=PHASE_3_GRID,
    validation_data=validation_data,
    top_k=1,
)


✓ Phase 3 grid size: 27

PHASE 3 — Lexical Hyperparameters
Total configs: 27
Completed configs: 3
Pending configs: 24
--------------------------------------------------------------------------------
[1/24] Running: RRF_k60_mu500_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault


RRF_k60_mu500_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault:   0%|          | 0/25 [00:00<?, ?it/s]

RRF_k60_mu500_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault: 100%|██████████| 25/25 [02:30<00:00,  6.01s/it]



RRF_k60_mu500_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault
   Retrieval: Retrieval(RRF_k=60, μ=500, k1=0.6, b=0.3) | k_docs=15, k_passages=5 | window=150, overlap=50
   Prompt: temp=0.0, top_p=1.0, max_tokens=256
   F1=19.09 | P=19.57 | R=29.17 | EM=7.00
   Questions: 100

✓ F1=19.0933
[2/24] Running: RRF_k60_mu1000_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault


RRF_k60_mu1000_k10.6_b0.3_kdocs15_kpass5_win150_ovl50_promptdefault:  40%|████      | 10/25 [01:22<02:03,  8.22s/it]


KeyboardInterrupt: 

In [None]:
# ============================================================
# FINAL SELECTION — Top 3 configs on 1,000 TRAIN questions
# ============================================================

FINAL_SELECTION_SEED = 123
FINAL_SELECTION_SIZE = 1000

# Take top-3 configs from Phase 3 (already sorted by F1)
TOP_3_CONFIGS = PHASE_3_TOP_CONFIGS[:3]

final_validation_data = df_train.sample(
    n=FINAL_SELECTION_SIZE,
    random_state=FINAL_SELECTION_SEED,
).reset_index(drop=True)

print("=" * 80)
print("FINAL MODEL SELECTION ON 1,000 TRAIN QUESTIONS")
print("=" * 80)

FINAL_SELECTION_RESULTS = []

for i, entry in enumerate(TOP_3_CONFIGS, 1):
    retrieval_mgr = entry["retrieval_mgr"]

    prompt_mgr = PromptManager(
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        user_prompt=DEFAULT_USER_PROMPT,
        temperature=0.0,
        do_sample=False,
        top_p=1.0,
        prompt_id="final",
    )

    config_key = generate_config_key(retrieval_mgr, prompt_mgr)

    print(f"\n[{i}/3] Evaluating config: {config_key}")

    result = run_experiment(
        name=f"{config_key}_final_select",
        df_data=final_validation_data,
        retrieval_manager=retrieval_mgr,
        prompt_manager=prompt_mgr,
        verbose=False,
    )

    print(
        f"✓ F1={result['f1_score']:.4f} | "
        f"EM={result['exact_match']:.4f} | "
        f"P={result['precision']:.4f} | "
        f"R={result['recall']:.4f}"
    )

    FINAL_SELECTION_RESULTS.append({
        "retrieval_mgr": retrieval_mgr,
        "prompt_mgr": prompt_mgr,
        "config_key": config_key,
        **result,
    })

# Select the best config by F1
FINAL_SELECTION_RESULTS.sort(
    key=lambda x: x["f1_score"],
    reverse=True,
)

BEST_FINAL_CONFIG = FINAL_SELECTION_RESULTS[0]

print("\n" + "=" * 80)
print("✓ BEST FINAL CONFIG SELECTED")
print("=" * 80)
print(
    f"{BEST_FINAL_CONFIG['config_key']} | "
    f"F1={BEST_FINAL_CONFIG['f1_score']:.4f}"
)


In [None]:
# ============================================================
# KAGGLE SUBMISSION — Final system on TEST set
# ============================================================

BEST_RETRIEVAL_MGR = BEST_FINAL_CONFIG["retrieval_mgr"]
BEST_PROMPT_MGR = BEST_FINAL_CONFIG["prompt_mgr"]

FINAL_CONFIG_KEY = BEST_FINAL_CONFIG["config_key"]

print("=" * 80)
print("KAGGLE SUBMISSION GENERATION")
print("=" * 80)
print(f"Using final config: {FINAL_CONFIG_KEY}")
print("=" * 80)

# Run inference only (no labels needed)
test_questions = df_test["question"].tolist()

print(f"Generating answers for {len(test_questions)} test questions...")

test_contexts = [
    BEST_RETRIEVAL_MGR.retrieve_context(q)
    for q in test_questions
]

test_answers = BEST_PROMPT_MGR.batch_generate_answers(
    questions=test_questions,
    contexts_list=test_contexts,
)

# Build Kaggle submission file
submission_df = pd.DataFrame({
    "id": df_test["id"],
    "answer": test_answers,
})

SUBMISSION_PATH = "./results/kaggle_submission.csv"
submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"✓ Kaggle submission file saved to: {SUBMISSION_PATH}")
print(f"✓ Total rows: {len(submission_df)}")
