In [None]:
import json
import os
import sys

import pandas as pd
from tqdm.auto import tqdm

# Make the project's src/ directory importable before the local imports below.
sys.path.insert(0, '../src')

from data_loader import HotpotQALoader
from rag_pipeline import VectorStore, LLMGenerator, RAGPipeline
from evaluator import Evaluator
from utils import set_random_seed, load_config

## 1. Setup

In [None]:
# Read the experiment settings from YAML and hand the configured seed to
# set_random_seed() before any other work happens.
config_path = '../config/config.yaml'
config = load_config(config_path)
set_random_seed(config['seed'])

# Echo the settings that matter most for this run.
print("Configuration loaded")
model_name = config['model']['name']
print(f"Model: {model_name}")
subset_size = config['dataset']['subset_size']
print(f"Subset size: {subset_size}")

## 2. Load Data

In [None]:
# Restore the previously prepared HotpotQA subset from disk.
subset_path = '../data/hotpotqa_subset.json'
loader = HotpotQALoader(subset_size=config['dataset']['subset_size'])
loader.load_subset(subset_path)

# Report how many examples are now available in loader.subset.
loaded_count = len(loader.subset)
print(f"Loaded {loaded_count} examples")

## 3. Build Retrieval Index

In [None]:
# Load the retrieval corpus from JSON.
# The file is read explicitly as UTF-8: without `encoding=...`, open() uses the
# platform's locale encoding, which can fail on or mis-decode non-ASCII
# passage text (notably on Windows).
with open('../data/corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

print(f"Loaded corpus with {len(corpus)} passages")

In [None]:
# Create the vector store with the configured encoder model, then build its
# index over the loaded corpus.
encoder_name = config['retrieval']['encoder_model']
vector_store = VectorStore(encoder_model=encoder_name)
vector_store.build_index(corpus)

## 4. Test Retrieval

In [None]:
# Sanity-check retrieval on the first question of the subset.
test_question = loader.subset[0]['question']
print(f"Question: {test_question}")

# Single source of truth for the number of passages, so the retrieve() call
# and the printed heading cannot drift apart.
top_k = 3
retrieved, scores = vector_store.retrieve(test_question, k=top_k)

print(f"\nTop {top_k} retrieved passages:")
for rank, (passage, score) in enumerate(zip(retrieved, scores), start=1):
    print(f"\n{rank}. [Score: {score:.3f}] {passage['title']}")
    # Only the first 200 characters of the passage text are shown.
    print(f"   {passage['text'][:200]}...")

## 5. Load LLM

In [None]:
# Instantiate the generator from the `model` section of the configuration.
# WARNING: This may take several minutes and requires significant memory
model_cfg = config['model']

generator = LLMGenerator(
    model_name=model_cfg['name'],
    device=model_cfg['device'],
    load_in_8bit=model_cfg['load_in_8bit'],
)

## 6. Create RAG Pipeline

In [None]:
# Create pipeline
# Combine the retrieval index and the language model into one object; the
# cells below call its answer_without_rag() and answer_with_rag() methods.
rag_pipeline = RAGPipeline(vector_store, generator)

print("RAG pipeline ready")

## 7. Test Single Example

In [None]:
# Smoke-test both answering modes on the first example of the subset.
test_example = loader.preprocess_example(loader.subset[0])
question = test_example['question']

print(f"Question: {question}")
print(f"Ground truth: {test_example['answer']}")

# Use the same generation settings as the full experiment loop, including the
# seed, so this spot check is reproducible and comparable with the answers
# recorded for the same example during the experiment.
sample_generation_kwargs = {
    'temperature': config['model']['temperature'],
    'max_new_tokens': config['model']['max_new_tokens'],
    'seed': config['seed'],
}

# Number of passages for the RAG spot check; also used in the printed label so
# the two cannot drift apart.
sample_k = 3

# No RAG
result_no_rag = rag_pipeline.answer_without_rag(question, **sample_generation_kwargs)
print(f"\nNo-RAG answer: {result_no_rag['answer']}")

# With RAG
result_rag = rag_pipeline.answer_with_rag(
    question,
    k=sample_k,
    **sample_generation_kwargs
)
print(f"\nRAG k={sample_k} answer: {result_rag['answer']}")

## 8. Run Full Experiment

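**Note:** The full run is slow: every example is answered once without retrieval and once for each configured `k`. Validate the loop on a small subset first by keeping `TEST_SIZE` low, then raise it for the full run.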

In [None]:
# Run on small subset for testing (adjust as needed)
TEST_SIZE = 10  # Set to len(loader.subset) for full run

# Never index past the end of the loaded subset: a TEST_SIZE larger than the
# number of loaded examples would otherwise raise IndexError mid-run.
n_eval = min(TEST_SIZE, len(loader.subset))

# Re-created on every execution of this cell, so each run collects its results
# into a new Evaluator instance.
evaluator = Evaluator()

# Generation settings shared by every condition so they differ only in retrieval.
generation_kwargs = {
    'temperature': config['model']['temperature'],
    'max_new_tokens': config['model']['max_new_tokens'],
    'seed': config['seed']
}

for i in tqdm(range(n_eval), desc="Processing examples"):
    example = loader.preprocess_example(loader.subset[i])
    question = example['question']

    # No-RAG baseline
    no_rag_result = rag_pipeline.answer_without_rag(question, **generation_kwargs)
    evaluator.evaluate_single(example, no_rag_result, condition='no_rag')

    # RAG with different k values; each k is recorded under its own condition name
    for k in config['retrieval']['k_values']:
        rag_result = rag_pipeline.answer_with_rag(question, k=k, **generation_kwargs)
        evaluator.evaluate_single(example, rag_result, condition=f'rag_k{k}')

print("\nExperiment completed!")

## 9. View Results

In [None]:
# Get results dataframe
# Presumably a pandas DataFrame of everything the evaluator recorded, going by
# the method name — confirm against evaluator.py.
results_df = evaluator.get_results_df()
# Bare last expression so the notebook renders the first 10 rows.
results_df.head(10)

In [None]:
# Aggregate results by condition.
# Condition names are derived from config['retrieval']['k_values'], the same
# source the experiment loop uses, instead of a hard-coded rag_k1/rag_k3/rag_k5
# list. The summary therefore stays in sync when the configured k values change.
conditions = ['no_rag'] + [f'rag_k{k}' for k in config['retrieval']['k_values']]

for condition in conditions:
    metrics = evaluator.aggregate_results(condition=condition)
    print(f"\n{condition}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.3f}")

In [None]:
# Compare the no-retrieval baseline against the RAG condition with k=3.
baseline_condition = 'no_rag'
rag_condition = 'rag_k3'
comparison = evaluator.compare_conditions(baseline_condition, rag_condition)

print("\nNo-RAG vs RAG k=3:")
print(f"EM difference: {comparison['em_diff']:.3f}")
print(f"F1 difference: {comparison['f1_diff']:.3f}")
print(f"Hallucination reduction: {comparison['hallucination_diff']:.3f}")

# The McNemar p-value is reported, then checked against the 0.05 threshold.
mcnemar_p = comparison['mcnemar_p_value']
print(f"McNemar p-value: {mcnemar_p:.4f}")
print(f"Significant: {mcnemar_p < 0.05}")

## 10. Save Results

In [None]:
# Save results.
# Create the output directory first: if ../results does not exist yet, the
# write could fail and lose the results of a long experiment at the last step.
results_path = '../results/evaluation_results.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
evaluator.save_results(results_path)

print("Results saved to results/evaluation_results.csv")

## Next Steps

- Run the full experiment (set `TEST_SIZE = len(loader.subset)`)
- Visualize results in notebook 03
- Perform error analysis
- Manual hallucination annotation