# Test RAGAs Evaluation

This notebook provides a minimal test setup for RAGAs evaluation.

## Features
- Small dataset subset for quick testing
- Module cache clearing so code changes are picked up between iterations
- Detailed error reporting

In [None]:
import os
import sys
import json
import importlib
import asyncio
from pathlib import Path
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    Faithfulness,
    AnswerRelevancy,
    ContextRecall,
    ContextPrecision,
)
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3

# Resolve the directory two levels above the working directory and register
# it on the import path (once) so the project-local imports below resolve.
project_root = Path("../..").resolve()
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

# Import utilities
from utils.aws.opensearch_utils import OpenSearchManager
from utils.notebook_utils.dataset_utils import load_labeled_dataset
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# Clear module cache to pick up changes
def clear_module_cache():
    """Evict project modules from ``sys.modules`` so the next import reloads them.

    Removes the ``utils`` and ``rag_implementations`` packages together with
    all of their submodules. The package roots are evicted as well so that
    edits to a package ``__init__`` are picked up, not only edits to
    submodules.

    Names that were already imported keep pointing at the old module objects;
    re-run the relevant import statements after calling this function.

    Prints the number of evicted modules and returns ``None``.
    """
    package_roots = ("utils", "rag_implementations")
    submodule_prefixes = tuple(f"{root}." for root in package_roots)
    # Iterate over a snapshot: sys.modules must not change size while it is
    # being iterated, and imports on other threads can add entries.
    modules_to_clear = [
        name
        for name in list(sys.modules)
        if name in package_roots or name.startswith(submodule_prefixes)
    ]
    for module in modules_to_clear:
        # pop() with a default tolerates an entry that vanished in the meantime.
        sys.modules.pop(module, None)
    print(f"Cleared {len(modules_to_clear)} modules from cache")

# Evict the cached project modules now. Names already imported above
# (e.g. OpenSearchManager) stay bound to the previously loaded module objects
# until their import statements are executed again.
clear_module_cache()

In [None]:
# OpenSearch domain configuration
OPENSEARCH_DOMAIN = "baseline-rag-benchmark-store"
INDEX_NAME = "originofcovid19dataset-benchmark"  # index name is fixed

print("Setting up OpenSearch...")
# cleanup_enabled=False: the index must be kept, so no cleanup is requested.
manager = OpenSearchManager(domain_name=OPENSEARCH_DOMAIN, cleanup_enabled=False, verbose=False)

# Resolve the domain endpoint and publish it through the environment
endpoint = manager.setup_domain()
os.environ.update(OPENSEARCH_HOST=endpoint)
print(f"OpenSearch endpoint: {endpoint}")

# Create OpenSearch client
# A single boto3 session supplies both the region and the credentials, so the
# two values always come from the same resolved configuration.
session = boto3.Session()
region = session.region_name
credentials = session.get_credentials()

# Fail early with an explicit message instead of an AttributeError on None
# (credentials) or a rejected request signature (region) further down.
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found; configure credentials before running this notebook"
    )
if region is None:
    raise RuntimeError(
        "No AWS region configured; set a default region before running this notebook"
    )

# SigV4 request signing for the 'es' service, including the session token
# when temporary credentials are in use.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'es',
    session_token=credentials.token
)

opensearch = OpenSearch(
    hosts=[{'host': endpoint, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

# Verify that the index is populated and show what a stored document looks like
print("\nChecking index...")
try:
    # Request a single hit; the response also carries the total match count.
    response = opensearch.search(
        index=INDEX_NAME,
        body={"query": {"match_all": {}}, "size": 1},
    )
    total_docs = response['hits']['total']['value']
    print(f"Found {total_docs} documents in index {INDEX_NAME}")

    if total_docs > 0:
        # Preview the first returned document
        sample_doc = response['hits']['hits'][0]['_source']
        print("\nSample document:")
        print(f"Content length: {len(sample_doc['content'])} chars")
        print(f"Content preview: {sample_doc['content'][:200]}...")
        print(f"Metadata: {sample_doc.get('metadata', {})}")

except Exception as e:
    # Report the failure, then let it propagate so the cell still fails.
    print(f"Error checking index: {str(e)}")
    raise

In [None]:
# Load small subset of data
DATASET_DIR = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
NUM_TEST_SAMPLES = 3  # Small subset for testing

print("Loading dataset...")
dataset, documents = load_labeled_dataset(DATASET_DIR)
# Keep only the first NUM_TEST_SAMPLES examples for a quick run.
test_examples = dataset.examples[:NUM_TEST_SAMPLES]
print(f"Using {len(test_examples)} test examples")

# Fail with an explicit message instead of a bare IndexError on the lookup
# below. len() is used rather than truthiness because the container type
# returned by the loader is not visible here.
if len(test_examples) == 0:
    raise ValueError(f"No examples found in dataset at {DATASET_DIR}")

# Show example structure
example = test_examples[0]
print("\nExample structure:")
print(f"Query: {example.query}")
print(f"Reference answer: {example.reference_answer}")
print(f"Number of reference contexts: {len(example.reference_contexts)}")

In [None]:
# Load the BaselineRAG class from its implementation notebook
implementation_path = str(
    project_root.joinpath('rag_implementations/baseline_rag/implementation.ipynb')
)
BaselineRAG = getattr(notebook_to_module(implementation_path), 'BaselineRAG')

# Build the pipeline against the same index name (INDEX_NAME)
rag = BaselineRAG(index_name=INDEX_NAME, chunk_size=500, chunk_overlap=50, enable_chunking=True)

In [None]:
# Call the retriever directly for the first test example
print("Testing semantic search...")
example = test_examples[0]
print(f"\nQuery: {example.query}")

# Fetch the top three matches for the query
docs = rag.semantic_search(example.query, k=3)
print(f"\nRetrieved {len(docs)} documents")

# Number the results from 1 and print a short summary of each
for i, doc in zip(range(1, len(docs) + 1), docs):
    print(f"\nDocument {i}:")
    print(f"Content length: {len(doc['content'])} chars")
    print(f"Content preview: {doc['content'][:200]}...")
    print(f"Metadata: {doc.get('metadata', {})}")

In [None]:
# Run each test example through the RAG pipeline and collect four parallel
# lists: the question, the generated answer, the retrieved context texts,
# and the reference answer.
print("Generating answers...")
questions, answers, contexts, references = [], [], [], []

for example in test_examples:
    print(f"\nQuery: {example.query}")
    result = rag.query(example.query)

    questions.append(example.query)
    answers.append(result['response'])
    # Keep only the text of each retrieved context document
    contexts.append([hit['content'] for hit in result['context']])
    # The reference is stored as one plain string per question
    references.append(example.reference_answer)

    print(f"Retrieved {len(result['context'])} context documents")
    print(f"Answer: {result['response'][:100]}...")
    print(f"Reference: {example.reference_answer[:100]}...")

In [None]:
# Assemble the column-oriented mapping used for the RAGAs evaluation
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    reference=references,  # one reference string per question
)

# Wrap the columns in a datasets.Dataset and show its structure
eval_dataset = Dataset.from_dict(data)
print("Dataset structure:", eval_dataset, sep="\n")

# Preview the first row
print("\nFirst example:")
example = eval_dataset[0]
print(f"Question: {example['question']}")
print(f"Answer: {example['answer'][:100]}...")
print(f"Number of contexts: {len(example['contexts'])}")
print(f"Reference: {example['reference'][:100]}...")

In [None]:
# Initialize Bedrock client
bedrock = boto3.client('bedrock-runtime')
llm_model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
embedding_model_id = "cohere.embed-english-v3"

# Initialize metrics with Bedrock models
# NOTE(review): the metrics are given raw model-ID strings, and the `bedrock`
# client created above is not used anywhere in the visible code. Ragas metrics
# appear to expect LLM / embedding wrapper objects rather than strings;
# confirm against the installed ragas version and wrap the Bedrock models if so.
metrics = [
    ContextPrecision(llm=llm_model_id),
    ContextRecall(llm=llm_model_id),
    Faithfulness(llm=llm_model_id),
    AnswerRelevancy(llm=llm_model_id, embeddings=embedding_model_id)
]

# Run evaluation
print("Running evaluation...")
try:
    results = evaluate(
        dataset=eval_dataset,
        metrics=metrics
    )

    # Convert to pandas for better display
    df = results.to_pandas()
    print("\nResults:")
    print(df)

except Exception as e:
    # Report the error together with a summary of the input columns, then
    # re-raise so the failure stays visible to the caller.
    print(f"Error during evaluation: {type(e).__name__}")
    print(f"Error details: {e}")
    print("\nDataset contents:")
    for key, value in data.items():
        print(f"\n{key}:")
        print(f"Type: {type(value)}")
        print(f"Length: {len(value)}")
        # Guard the preview: indexing an empty column would raise IndexError
        # here and mask the evaluation error that is being reported.
        if len(value) > 0:
            print(f"First item: {value[0][:100]}...")
        else:
            print("First item: <empty>")
    raise