## 1. Setup & Dependencies

In [1]:
# Shared imports for the whole notebook (data handling, text utilities, typing helpers).
import pandas as pd
import json
import re
import string
from collections import Counter
from typing import List, Dict, Tuple, Optional, Literal
from dataclasses import dataclass
from tqdm.auto import tqdm
# Enable tqdm's pandas integration.
tqdm.pandas()
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the session,
# including deprecation notices raised by libraries loaded in later cells.
warnings.filterwarnings('ignore')

print("‚úì Dependencies imported")

‚úì Dependencies imported


  from .autonotebook import tqdm as notebook_tqdm


### Install Required Packages

In [2]:
# !pip install torch torchvision torchaudio
# !pip install pyserini==0.36.0
# !pip install accelerate
# !pip install transformers
# !pip install tqdm
# !pip install semantic-text-splitter
# !pip install python-dotenv

### Hugging Face Authentication

In [3]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv, then authenticate with the
# Hugging Face Hub using the HUGGING_FACE_TOKEN variable (None when it is unset).
load_dotenv()

login(token=os.environ.get('HUGGING_FACE_TOKEN'))
print("‚úì Logged into Hugging Face")

‚úì Logged into Hugging Face


## 2. Data Loading & Preparation

In [4]:
# Read both CSV splits. The training file stores each answer list as a JSON
# string, so the `answers` column is decoded with json.loads while parsing.
df_train = pd.read_csv("./data/train.csv", converters={"answers": json.loads})
df_test = pd.read_csv("./data/test.csv")

# Report split sizes and show the first training example as a sanity check.
print("Train set: {} questions".format(len(df_train)))
print("Test set: {} questions".format(len(df_test)))
print("\nSample question: {}".format(df_train['question'].iloc[0]))
print("Sample answers: {}".format(df_train['answers'].iloc[0]))

Train set: 3778 questions
Test set: 2032 questions

Sample question: what is the name of justin bieber brother?
Sample answers: ['Jazmyn Bieber', 'Jaxon Bieber']


In [5]:
# Hold out a fixed fraction of the labelled data for validation.
RANDOM_SEED = 42
VAL_SIZE = 0.2

# Sample the training portion reproducibly; every row that was not sampled
# becomes the validation set. Both frames get a fresh 0-based index.
df_train_split = df_train.sample(frac=1 - VAL_SIZE, random_state=RANDOM_SEED)
df_val = df_train.drop(index=df_train_split.index).reset_index(drop=True)
df_train_split = df_train_split.reset_index(drop=True)

print("‚úì Split data:")
print("   Training: {} questions".format(len(df_train_split)))
print("   Validation: {} questions".format(len(df_val)))

‚úì Split data:
   Training: 3022 questions
   Validation: 756 questions


## 3. Retrieval Functions

In [6]:
# SimpleSearcher is deprecated (Pyserini prints a notice when it is used);
# LuceneSearcher is the replacement named by that notice.
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader

# Load the prebuilt KILT Wikipedia index twice: once for searching and once
# for reading index statistics.
print("Loading Pyserini index...")
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-kilt-doc')
index_reader = IndexReader.from_prebuilt_index('wikipedia-kilt-doc')

print(f"‚úì Index loaded: {index_reader.stats()['documents']} documents")

[0;93m2025-12-12 10:55:57.460178756 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card0/device/vendor"[m


Loading Pyserini index...


Dec 12, 2025 10:55:58 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


SimpleSearcher class has been deprecated, please use LuceneSearcher from pyserini.search.lucene instead
‚úì Index loaded: 5903530 documents


In [7]:
@dataclass
class RetrievalConfig:
    """Settings that control how context is retrieved for a question.

    Attributes:
        k: Number of hits requested from the searcher.
        method: Ranking method: 'bm25', 'rm3' or 'qld'.
        mu: Parameter passed to the searcher for 'qld'.
        k1, b: Parameters passed to the searcher for 'bm25'.
        rm3_terms, rm3_docs, rm3_weight: Parameters passed to the searcher for 'rm3'.
        use_passages: When True, documents are split into overlapping word
            windows; otherwise each document is truncated to ``max_chars``.
        window, overlap, min_passage_words: Passage-splitting parameters (in words).
        max_passages: Cap on the number of passages returned in passage mode.
        max_chars: Character cap per document in document mode.
    """
    k: int = 5
    method: Literal['bm25', 'rm3', 'qld'] = 'qld'
    mu: int = 1000
    k1: float = 0.9
    b: float = 0.4
    rm3_terms: int = 10
    rm3_docs: int = 10
    rm3_weight: float = 0.5
    use_passages: bool = False
    window: int = 150
    overlap: int = 50
    min_passage_words: int = 30
    max_passages: int = 10
    max_chars: int = 300

    def __str__(self):
        """Return a one-line summary: ranking method with its parameters, then the context mode."""
        # Parameter summary per ranking method; an unrecognised method gets an empty summary.
        params_by_method = {
            'qld': f"mu={self.mu}",
            'bm25': f"k1={self.k1}, b={self.b}",
            'rm3': f"terms={self.rm3_terms}, docs={self.rm3_docs}",
        }
        method_params = params_by_method.get(self.method, "")

        mode_info = (
            f"passages: window={self.window}, overlap={self.overlap}, max={self.max_passages}"
            if self.use_passages
            else f"docs: {self.max_chars}chars"
        )

        header = f"{self.method.upper()}(k={self.k}, {method_params})"
        return " | ".join((header, mode_info))

def extract_passages(text, window=150, overlap=50, min_words=30):
    """Split text into overlapping word-window passages.

    Args:
        text: Input text; split on whitespace.
        window: Maximum number of words per passage.
        overlap: Number of words shared between consecutive passages. The
            stride is ``max(1, window - overlap)``.
        min_words: Texts shorter than this yield no passages. A trailing chunk
            shorter than this is merged into the previous passage instead of
            becoming its own passage.

    Returns:
        A list of passage strings (possibly empty). No passage is emitted once
        a previous passage has already reached the end of the text, and a
        merged tail only contributes words not already in the previous passage.
    """
    if not text:
        return []

    words = text.split()
    if len(words) < min_words:
        return []

    passages = []
    step = max(1, window - overlap)
    covered_end = 0  # index one past the last word already placed in a passage

    for start in range(0, len(words), step):
        end = min(start + window, len(words))
        chunk = words[start:end]

        if len(chunk) < min_words:
            if passages:
                # Append only the words the previous passage does not already
                # contain; the overlap region would otherwise be duplicated.
                fresh = words[max(covered_end, start):end]
                if fresh:
                    passages[-1] += " " + " ".join(fresh)
            else:
                passages.append(" ".join(chunk))
            break

        passages.append(" ".join(chunk))
        covered_end = end

        # Once a passage reaches the end of the text, any later window would be
        # fully contained in it, so stop here.
        if end == len(words):
            break

    return passages

def retrieve_context(query: str, config: RetrievalConfig) -> List[str]:
    """Retrieve documents or passages for a query using the module-level searcher.

    Args:
        query: Question text sent to the searcher.
        config: Ranking method, its parameters, and the context mode.

    Returns:
        In document mode, one string per hit, truncated to ``config.max_chars``.
        In passage mode, passages extracted from the hits in rank order, capped
        at ``config.max_passages``. Hits whose stored document cannot be read
        or parsed are skipped.
    """
    # The searcher is shared across calls, so its ranking state must be fully
    # reset here. Without this, RM3 enabled by an earlier call stays active for
    # later BM25/QLD runs.
    if config.method == 'rm3':
        # RM3 expands the query on top of a base ranking; pin that base to BM25
        # so the result does not depend on which method ran previously.
        searcher.set_bm25(config.k1, config.b)
        searcher.set_rm3(config.rm3_terms, config.rm3_docs, config.rm3_weight)
    else:
        if searcher.is_using_rm3():
            searcher.unset_rm3()
        if config.method == 'bm25':
            searcher.set_bm25(config.k1, config.b)
        else:
            searcher.set_qld(config.mu)

    hits = searcher.search(query, config.k)

    contexts = []
    for hit in hits:
        try:
            doc = searcher.doc(hit.docid)
            data = json.loads(doc.raw())
            content = data['contents'].replace('\n', ' ')
        except Exception:
            # Best effort: skip unreadable documents. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt stop a long run.
            continue

        if config.use_passages:
            contexts.extend(
                extract_passages(content, config.window, config.overlap, config.min_passage_words)
            )
        else:
            contexts.append(content[:config.max_chars])

    return contexts[:config.max_passages] if config.use_passages else contexts

# Smoke test for both retrieval modes using a single example question.
query = "Who wrote Harry Potter?"

# Document mode: top-3 hits, each truncated by the config's character cap.
test_config_docs = RetrievalConfig(method='qld', mu=1000, k=3)
print("Testing: " + str(test_config_docs))
test_docs = retrieve_context(query, test_config_docs)
print("‚úì Retrieved {} documents".format(len(test_docs)))
for snippet in test_docs:
    print("  - " + snippet[:100] + "...")
print()

# Passage mode: the same hits split into overlapping word windows, capped at 8.
test_config_passages = RetrievalConfig(
    k=3,
    method='qld',
    mu=1000,
    use_passages=True,
    window=150,
    overlap=50,
    max_passages=8,
)
print("Testing: " + str(test_config_passages))
test_passages = retrieve_context(query, test_config_passages)
print("‚úì Retrieved {} passages".format(len(test_passages)))
for snippet in test_passages:
    print("  - " + snippet[:100] + "...")

Testing: QLD(k=3, mu=1000) | docs: 300chars
‚úì Retrieved 3 documents
  - Harry Potter Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The...
  - Bonnie Wright Bonnie Francesca Wright (born 17 February 1991) is an English actress, film director, ...
  - Politics of Harry Potter There are many published theories about the politics of the Harry Potter bo...

Testing: QLD(k=3, mu=1000) | passages: window=150, overlap=50, max=8
‚úì Retrieved 8 passages
  - Harry Potter Harry Potter is a series of fantasy novels written by British author J. K. Rowling. The...
  - June 1997, the books have found immense popularity, critical acclaim and commercial success worldwid...
  - English by two major publishers, Bloomsbury in the United Kingdom and Scholastic Press in the United...
  - media franchises of all time. A series of many genres, including fantasy, drama, coming of age, and ...
  - in 2012, a digital platform on which J.K. Rowling updates the series with n

## 4. LLM Generation

In [19]:
import transformers
import torch
import logging

# Suppress transformers warnings (both via the library's own verbosity switch
# and via the standard logging level of its logger).
transformers.logging.set_verbosity_error()
logging.getLogger("transformers").setLevel(logging.ERROR)

print("Loading LLM model...")
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Build a text-generation pipeline for the model; weights are requested in
# bfloat16 and device placement is left to device_map="auto".
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Token ids that end generation: the tokenizer's EOS token plus the id of the
# "<|eot_id|>" token. Passed as `eos_token_id` when generating answers.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# NOTE(review): this reports whether CUDA is available, not where the pipeline
# actually placed the model — confirm if the distinction matters.
print(f"‚úì Model loaded on: {'GPU' if torch.cuda.is_available() else 'CPU'}")

Loading LLM model...
‚úì Model loaded on: GPU


In [13]:
# System instruction for grounded answering. Adjacent string literals are
# concatenated with no separator, so each sentence carries its own trailing
# space; without it the sentences run together ("passages.Do not ...").
# The phrase 'I dont know' is matched later when answers are cleaned.
DEFAULT_SYSTEM_PROMPT = (
    "You must respond based strictly on the information in provided passages. "
    "Do not incorporate any external knowledge or infer any details beyond what is given. "
    "If the answer is not in the context, return 'I dont know'. "
    "Do not include explanations, only the final answer!"
)

# User-turn template; `{context}` and `{question}` are filled with str.format.
DEFAULT_USER_PROMPT = (
    "Based on the following documents, provide a concise answer to the question.\n\n"
    "{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

@dataclass
class PromptConfig:
    """Prompt templates and sampling parameters used when generating an answer.

    Attributes:
        system_prompt: Content of the system message.
        user_prompt: Template for the user message; formatted with ``context``
            and ``question``.
        temperature, top_p, max_new_tokens, do_sample: Forwarded to the
            text-generation pipeline call.
    """
    system_prompt: str = DEFAULT_SYSTEM_PROMPT
    user_prompt: str = DEFAULT_USER_PROMPT
    temperature: float = 0.6
    top_p: float = 0.9
    max_new_tokens: int = 256
    do_sample: bool = True

    def __str__(self):
        """Return a short summary of the sampling parameters."""
        summary = (
            ("temp", self.temperature),
            ("top_p", self.top_p),
            ("max_tokens", self.max_new_tokens),
        )
        return ", ".join(f"{label}={value}" for label, value in summary)

def clean_answer(answer: str) -> str:
    """Clean and standardize a generated answer.

    Steps, in order: trim surrounding whitespace, drop a leading boilerplate
    prefix ("Answer", "The answer is", "Based on the ...,", each optionally
    followed by a colon), remove trailing periods, and map any "don't know"
    style reply to the literal string "unknown".

    Args:
        answer: Raw text produced by the model.

    Returns:
        The cleaned answer, or "unknown" when the text contains one of the
        refusal phrases.
    """
    # Trim first: the prefix pattern is anchored at the start of the string and
    # the period removal at its end, so surrounding whitespace would otherwise
    # defeat both.
    answer = answer.strip()
    answer = re.sub(r'^(Answer|The answer is|Based on the .*?,):?\s*', '', answer, flags=re.I)
    answer = answer.strip().rstrip('.').strip()
    # NOTE(review): this is a substring match, so a genuine answer containing
    # the word "unknown" is also collapsed to "unknown".
    if any(phrase in answer.lower() for phrase in ["dont know", "don't know", "do not know", "unknown"]):
        return "unknown"
    return answer

def create_messages(question: str, contexts: List[str], config: PromptConfig) -> List[Dict]:
    """Build the system and user chat messages for one question.

    Each context is labelled "Document N:" (numbered from 1) and the labelled
    contexts are separated by blank lines. With no contexts, a fixed
    placeholder sentence is used instead.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    if contexts:
        numbered = [f"Document {n}: {ctx}" for n, ctx in enumerate(contexts, start=1)]
        context_str = '\n\n'.join(numbered)
    else:
        context_str = "No relevant documents found."

    system_message = {"role": "system", "content": config.system_prompt}
    user_message = {
        "role": "user",
        "content": config.user_prompt.format(context=context_str, question=question),
    }
    return [system_message, user_message]

def generate_answer(question: str, contexts: List[str], config: PromptConfig) -> str:
    """Run the module-level text-generation pipeline and return the cleaned answer.

    The chat messages are built from the question and contexts, generation is
    stopped on the module-level terminator ids, and the content of the last
    message in the first output is passed through ``clean_answer``.
    """
    generation_kwargs = dict(
        max_new_tokens=config.max_new_tokens,
        eos_token_id=terminators,
        do_sample=config.do_sample,
        temperature=config.temperature,
        top_p=config.top_p,
    )
    outputs = pipeline(create_messages(question, contexts, config), **generation_kwargs)

    # "generated_text" holds the conversation; its last entry is the new reply.
    last_message = outputs[0]["generated_text"][-1]
    return clean_answer(last_message.get('content', ''))

# Smoke test: answer the example question from the passages retrieved earlier,
# using a low temperature.
test_prompt_config = PromptConfig(temperature=0.1)
print("Testing: " + str(test_prompt_config))
test_answer = generate_answer(
    question=query,
    contexts=test_passages,
    config=test_prompt_config,
)
print("‚úì Generated answer: '{}'".format(test_answer))

Testing: temp=0.1, top_p=0.9, max_tokens=256
‚úì Generated answer: 'J. K. Rowling'


## 5. Evaluation Metrics

In [14]:
def normalize_answer(s: str) -> str:
    """Normalize an answer string for comparison.

    Applies, in order: lowercasing, removal of ASCII punctuation, replacement
    of the standalone words "a", "an" and "the" with a space, and collapsing
    of all whitespace runs to single spaces.
    """
    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    return ' '.join(without_articles.split())


def f1_score(prediction: str, ground_truth: str) -> float:
    """Compute token-level F1 between a prediction and one ground-truth answer.

    Both strings are normalized and split on whitespace. If either side has no
    tokens, the result is 1 when both are empty and 0 otherwise. If the two
    sides share no tokens, the result is 0.
    """
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()

    # Degenerate case: at least one side is empty after normalization.
    if not pred_tokens or not gt_tokens:
        return int(pred_tokens == gt_tokens)

    # The multiset intersection counts each shared token as many times as it
    # appears on both sides.
    overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if overlap == 0:
        return 0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gt_tokens)
    return (2 * precision * recall) / (precision + recall)


def evaluate_predictions(df_gold: pd.DataFrame, predictions: Dict[int, str]) -> Tuple[float, List[float]]:
    """
    Evaluate predictions against ground truth.

    Args:
        df_gold: Frame with an 'id' column and an 'answers' column holding the
            acceptable answers for each question.
        predictions: Mapping from question id to predicted answer.

    Returns:
        (average_f1_score, list_of_individual_f1_scores). The average is scaled
        to 0-100; the individual scores are in row order of ``df_gold``. A
        question without a prediction, or with no ground-truth answers, scores 0.
    """
    scores = []

    for _, row in df_gold.iterrows():
        qid = row['id']
        if qid not in predictions:
            scores.append(0.0)
            continue

        prediction = predictions[qid]
        ground_truths = row['answers']
        # A bare string would otherwise be iterated character by character;
        # treat it as a single acceptable answer.
        if isinstance(ground_truths, str):
            ground_truths = [ground_truths]

        # Best F1 over all acceptable answers; `default` keeps an empty answer
        # list from raising ValueError.
        max_f1 = max((f1_score(prediction, gt) for gt in ground_truths), default=0.0)
        scores.append(max_f1)

    avg_score = 100.0 * sum(scores) / len(scores) if scores else 0.0
    return avg_score, scores


# Test evaluation
# Toy gold set: questions 1 and 3 have a matching acceptable answer, question 2 does not.
test_gold = pd.DataFrame({
    'id': [1, 2, 3],
    'answers': [
        ["J.K. Rowling", "Rowling"],
        ["Earth"],
        ["William Shakespeare", "Shakespeare"],
    ],
})
test_predictions = {
    1: "J.K. Rowling",
    2: "Paris",
    3: "Shakespeare",
}
test_score, _ = evaluate_predictions(test_gold, test_predictions)
print("‚úì Evaluation test: F1 = {:.2f}".format(test_score))

‚úì Evaluation test: F1 = 66.67


## 6. Experiment Framework

In [15]:
def run_experiment(
    name: str,
    df_data: pd.DataFrame,
    retrieval_config: RetrievalConfig,
    prompt_config: PromptConfig,
    max_questions: Optional[int] = None,
    verbose: bool = True
) -> Dict:
    """Run retrieval + generation over a set of questions and score the result.

    Args:
        name: Label used for the progress bar and the printed summary.
        df_data: Frame with 'id' and 'question' columns. When it also has an
            'answers' column, predictions are scored against it.
        retrieval_config: Retrieval settings applied to every question.
        prompt_config: Prompt and sampling settings applied to every question.
        max_questions: If truthy, only the first ``max_questions`` rows are used.
        verbose: Show a progress bar and print a summary.

    Returns:
        Dict with keys 'name', 'retrieval', 'prompt', 'f1_score',
        'num_questions', 'predictions' (id -> answer) and 'individual_scores'
        (row order of the evaluated frame). For data without an 'answers'
        column, 'f1_score' is None and 'individual_scores' is empty.
    """
    if max_questions:
        df_data = df_data.head(max_questions)

    predictions = {}
    rows = df_data.iterrows()
    iterator = tqdm(rows, total=len(df_data), desc=name) if verbose else rows

    for _, row in iterator:
        question = row['question']
        qid = row['id']

        contexts = retrieve_context(question, retrieval_config)
        answer = generate_answer(question, contexts, prompt_config)
        predictions[qid] = answer

    # Unlabeled data (e.g. a held-out test split) cannot be scored; skip the
    # evaluation instead of failing on the missing 'answers' column.
    if 'answers' in df_data.columns:
        f1, individual_scores = evaluate_predictions(df_data, predictions)
    else:
        f1, individual_scores = None, []

    result = {
        'name': name,
        'retrieval': retrieval_config,
        'prompt': prompt_config,
        'f1_score': f1,
        'num_questions': len(df_data),
        'predictions': predictions,
        'individual_scores': individual_scores
    }

    if verbose:
        f1_text = f"{f1:.2f}" if f1 is not None else "n/a (no 'answers' column)"
        print(f"\n{name}")
        print(f"   Retrieval: {retrieval_config}")
        print(f"   Prompt: {prompt_config}")
        print(f"   F1 Score: {f1_text}")
        print(f"   Questions: {len(df_data)}\n")

    return result

# End-to-end smoke test of the experiment loop on the first five validation questions.
test_retrieval = RetrievalConfig(method='qld', k=3)
test_prompt = PromptConfig(temperature=0.1)

print("Testing experiment with:")
print("  Retrieval: {}".format(test_retrieval))
print("  Prompt: {}".format(test_prompt))

test_exp = run_experiment(
    name="Quick Test",
    df_data=df_val.head(5),
    retrieval_config=test_retrieval,
    prompt_config=test_prompt,
    verbose=True,
)

print("‚úì Experiment framework ready")

Testing experiment with:
  Retrieval: QLD(k=3, mu=1000) | docs: 300chars
  Prompt: temp=0.1, top_p=0.9, max_tokens=256


Quick Test: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:14<00:00,  2.85s/it]


Quick Test
   Retrieval: QLD(k=3, mu=1000) | docs: 300chars
   Prompt: temp=0.1, top_p=0.9, max_tokens=256
   F1 Score: 14.67
   Questions: 5

‚úì Experiment framework ready





## 7. Run Experiments

Now we can quickly test different configurations!

### Quick Test (10 questions)

### Experiment 1: Baseline (QL Dirichlet)

In [18]:
# Baseline: 'qld' retrieval (mu=1000) of the top-5 documents, each truncated to
# 300 characters, evaluated on the first 100 validation questions.
exp1_baseline = run_experiment(
    name="Baseline - QLD Docs",
    df_data=df_val,
    retrieval_config=RetrievalConfig(k=5, method='qld', mu=1000, max_chars=300),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

Baseline - QLD Docs:   6%|‚ñå         | 6/100 [00:24<06:18,  4.03s/it]


KeyboardInterrupt: 

### Experiment 2: Try BM25

In [None]:
# Same setup as the baseline, but ranking with 'bm25' (k1=0.9, b=0.4).
exp2_bm25 = run_experiment(
    name="BM25 Docs",
    df_data=df_val,
    retrieval_config=RetrievalConfig(k=5, method='bm25', k1=0.9, b=0.4, max_chars=300),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

### Experiment 3: RM3 Query Expansion

In [None]:
# Same setup as the baseline, but with the 'rm3' method (10 terms, 10 docs,
# weight 0.5); documents are still truncated to 300 characters.
exp3_rm3 = run_experiment(
    name="RM3 Query Expansion Docs",
    df_data=df_val,
    retrieval_config=RetrievalConfig(
        k=5, 
        method='rm3', 
        rm3_terms=10, 
        rm3_docs=10,
        rm3_weight=0.5,
        max_chars=300
    ),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

### Experiment 4: QLD with Passages

In [None]:
# 'qld' retrieval in passage mode: the top-5 documents are split into
# 150-word windows with 50 words of overlap, and at most 8 passages are kept.
# Evaluated on the first 100 validation questions.
exp4_qld_passages = run_experiment(
    name="QLD Passages",
    df_data=df_val,
    retrieval_config=RetrievalConfig(
        k=5,
        method='qld',
        mu=1000,
        use_passages=True,
        window=150,
        overlap=50,
        max_passages=8
    ),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

### Experiment 5: BM25 with Passages

In [None]:
# 'bm25' retrieval (k1=0.9, b=0.4) in passage mode: the top-5 documents are
# split into 150-word windows with 50 words of overlap, and at most 8 passages
# are kept. Evaluated on the first 100 validation questions.
exp5_bm25_passages = run_experiment(
    name="BM25 Passages",
    df_data=df_val,
    retrieval_config=RetrievalConfig(
        k=5,
        method='bm25',
        k1=0.9,
        b=0.4,
        use_passages=True,
        window=150,
        overlap=50,
        max_passages=8
    ),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

### Experiment 6: More Documents (k=10)

In [None]:
# Same setup as the baseline, but retrieving the top-10 documents instead of 5.
exp6_more_docs = run_experiment(
    name="More Docs (k=10)",
    df_data=df_val,
    retrieval_config=RetrievalConfig(k=10, method='qld', mu=1000, max_chars=300),
    prompt_config=PromptConfig(temperature=0.6, top_p=0.9),
    max_questions=100
)

## 8. Results Analysis

In [None]:
# Collect all experiments
experiments = [
    exp1_baseline,
    exp2_bm25,
    exp3_rm3,
    exp4_qld_passages,
    exp5_bm25_passages,
    exp6_more_docs
]

# Create comparison DataFrame: one row per experiment, best F1 first.
results_df = pd.DataFrame([
    {
        'Experiment': exp['name'],
        'F1 Score': exp['f1_score'],
        'Retrieval': str(exp['retrieval']),
        'Prompt': f"temp={exp['prompt'].temperature}"
    }
    for exp in experiments
])

results_df = results_df.sort_values('F1 Score', ascending=False).reset_index(drop=True)
print("\n" + "="*80)
print("üìä EXPERIMENT RESULTS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Find best configuration. After reset_index the table's first index label is
# always 0, so it cannot be used to look up the experiment list; select the
# highest-scoring experiment directly instead.
best_exp = max(experiments, key=lambda exp: exp['f1_score'])
print(f"\nüèÜ BEST CONFIGURATION:")
print(f"   Name: {best_exp['name']}")
print(f"   F1 Score: {best_exp['f1_score']:.2f}")
print(f"   Retrieval: {best_exp['retrieval']}")
print(f"   Prompt: {best_exp['prompt']}")

## 9. Generate Submission with Best Config

In [None]:
# Use the best configuration to generate predictions for test set
# NOTE(review): df_test was read without the JSON converter for `answers`, so
# that column may be missing or unparsed — treat any F1 from this run as not
# meaningful and confirm the run completes on unlabeled data.
# NOTE(review): `test_predictions` was bound earlier to a plain id -> answer
# dict; here it is rebound to the full result dict returned by run_experiment.
print("üöÄ Generating predictions for test set with best configuration...")

test_predictions = run_experiment(
    name="Final Test Submission",
    df_data=df_test,
    retrieval_config=best_exp['retrieval'],
    prompt_config=best_exp['prompt'],
    max_questions=None,  # Use all test questions
    verbose=True
)

In [None]:
# Build the submission table: one row per question id, with the prediction
# stored as a JSON-encoded single-element list.
submission_rows = []
for qid, pred in test_predictions['predictions'].items():
    submission_rows.append({
        'id': qid,
        'prediction': json.dumps([pred], ensure_ascii=False),
    })
submission_df = pd.DataFrame(submission_rows)

# Write the table to disk without the index column.
# NOTE(review): the inputs were read from "./data/" but this writes to
# "../data/" — confirm the target directory is intended.
output_path = "../data/submission_best_config.csv"
submission_df.to_csv(output_path, index=False)

print("‚úì Submission saved to: {}".format(output_path))
print("üìä Total predictions: {}".format(len(submission_df)))
print("\nSample predictions:")
print(submission_df.head())

## 10. Error Analysis (Optional)

In [None]:
# Analyze errors from validation set
def analyze_errors(result: Dict, df_gold: pd.DataFrame, top_n: int = 10):
    """Show the worst-scoring questions of an experiment.

    Args:
        result: Experiment result with 'predictions' (id -> answer) and
            'individual_scores' (one score per evaluated row, in row order).
        df_gold: Frame with 'id', 'question' and 'answers' columns, in the same
            row order as the data the experiment was run on. It may contain
            extra trailing rows that were not evaluated.
        top_n: Number of lowest-scoring questions to print.

    Returns:
        DataFrame with columns id, question, prediction, ground_truth and
        f1_score, sorted by ascending f1_score (empty if nothing matched).
    """
    predictions = result['predictions']
    scores = result['individual_scores']
    columns = ['id', 'question', 'prediction', 'ground_truth', 'f1_score']

    error_analysis = []
    # Scores are aligned by row position, not by index label, so enumerate the
    # rows instead of indexing the score list with the DataFrame index.
    for position, (_, row) in enumerate(df_gold.iterrows()):
        qid = row['id']
        if qid not in predictions or position >= len(scores):
            continue
        error_analysis.append({
            'id': qid,
            'question': row['question'],
            'prediction': predictions[qid],
            'ground_truth': row['answers'],
            'f1_score': scores[position]
        })

    # Explicit columns keep sort_values working when no rows were collected.
    error_df = pd.DataFrame(error_analysis, columns=columns).sort_values('f1_score')

    print(f"\n{'='*80}")
    print(f"‚ùå WORST {top_n} PREDICTIONS")
    print(f"{'='*80}")

    for _, row in error_df.head(top_n).iterrows():
        print(f"\nQ: {row['question']}")
        print(f"Predicted: {row['prediction']}")
        print(f"Expected: {row['ground_truth']}")
        print(f"F1: {row['f1_score']:.2f}")
        print("-" * 80)

    return error_df

# Analyze best experiment: print its five lowest-scoring validation questions
# and keep the full per-question table in `error_df`.
error_df = analyze_errors(best_exp, df_val, top_n=5)

---
## 🎯 Next Steps

1. **Review error analysis** to understand failure modes
2. **Try custom configurations** by creating new RetrievalConfig/PromptConfig objects
3. **Combine best strategies** (e.g., RM3 + lower temperature + longer context)
4. **Submit to Kaggle** and compare with leaderboard scores
5. **Iterate based on results**

Happy experimenting! 🚀