# Test RAGAs Evaluation

This notebook provides a minimal test setup for RAGAs evaluation.
It assumes that OpenSearch is already set up and that the data has already been loaded by baseline_rag_benchmark.ipynb.

## Features
- Small dataset subset for quick testing
- Cache clearing for rapid iterations
- Detailed error reporting

In [None]:
import os
import sys
import json
import importlib
import asyncio
from pathlib import Path
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Resolve the directory two levels above the current working directory and
# register it on sys.path once, so project packages can be imported below.
project_root = Path("../..").resolve()
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

# Import utilities
from utils.notebook_utils.dataset_utils import load_labeled_dataset
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# Forget cached project modules so edits on disk are seen by later imports
def clear_module_cache():
    """Remove cached project modules from ``sys.modules``.

    Every entry whose name starts with ``utils.`` or ``rag_implementations.``
    is dropped, and the number of removed entries is printed.

    Note: names that were already bound with ``from ... import ...`` keep
    referring to the old module objects until that import is executed again.
    """
    project_prefixes = ('utils.', 'rag_implementations.')
    # Collect the names first; the mapping must not change size while iterated.
    stale_names = [name for name in sys.modules if name.startswith(project_prefixes)]
    for name in stale_names:
        sys.modules.pop(name)
    print(f"Cleared {len(stale_names)} modules from cache")

clear_module_cache()

In [None]:
# Load the labeled dataset and keep only its first few examples
NUM_TEST_SAMPLES = 3  # Small subset for testing
DATASET_DIR = project_root / "datasets" / "rag_evaluation" / "labeled" / "covid19_origin"

print("Loading dataset...")
dataset, documents = load_labeled_dataset(DATASET_DIR)

# Slicing tolerates datasets with fewer than NUM_TEST_SAMPLES examples.
test_examples = dataset.examples[slice(None, NUM_TEST_SAMPLES)]
print(f"Using {len(test_examples)} test examples")

In [None]:
# Load the BaselineRAG class from its implementation notebook
# (assumes OpenSearch is set up)
implementation_path = str(
    project_root / 'rag_implementations' / 'baseline_rag' / 'implementation.ipynb'
)
_baseline_module = notebook_to_module(implementation_path)
BaselineRAG = _baseline_module.BaselineRAG

# Create the RAG instance used by the cells below
rag = BaselineRAG(index_name="origincovid19dataset-benchmark")

In [None]:
# Query the RAG pipeline for each test example and collect, per example,
# the question, the generated answer, the retrieved context texts and the
# reference answer.
print("Generating answers...")
questions, answers, contexts, ground_truths = [], [], [], []

for example in test_examples:
    print(f"\nQuery: {example.query}")
    result = rag.query(example.query)

    retrieved_docs = result['context']
    context_texts = [doc['content'] for doc in retrieved_docs]

    questions.append(example.query)
    answers.append(result['response'])
    contexts.append(context_texts)
    # RAGAs expects list of lists, so wrap the single reference answer
    ground_truths.append([example.reference_answer])

    print(f"Retrieved {len(result['context'])} context documents")
    # Only the first 100 characters of the answer are shown
    print(f"Answer: {result['response'][:100]}...")

In [None]:
# Assemble the evaluation data: each key is a column name and each value is
# the per-example list collected in the previous cell.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Build the dataset object and show its structure
eval_dataset = Dataset.from_dict(data)
print("Dataset structure:", eval_dataset, sep="\n")

In [None]:
# Run the RAGAs evaluation over eval_dataset with the four imported metrics.
# On failure, print the error and a summary of every column in `data`, then
# re-raise so the cell still fails visibly.
print("Running evaluation...")
try:
    results = evaluate(
        dataset=eval_dataset,
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
    )

    # Convert to pandas for better display
    df = results.to_pandas()
    print("\nResults:")
    print(df)

except Exception as e:
    print(f"Error during evaluation: {type(e).__name__}")
    print(f"Error details: {str(e)}")
    print("\nDataset contents:")
    for key, value in data.items():
        print(f"\n{key}:")
        print(f"Type: {type(value)}")
        print(f"Length: {len(value)}")
        # An empty column has no first item; indexing it would raise
        # IndexError here and bury the original evaluation error.
        if value:
            print(f"First item: {value[0][:100]}...")
        else:
            print("First item: <none - column is empty>")
    # Bare raise re-raises the evaluation error with its original traceback.
    raise