# Baseline RAG Benchmarking

This notebook benchmarks our baseline RAG implementation using the Origin of Covid-19 dataset.

## Prerequisites
- Run `setup.ipynb` first to configure the environment

## Process
1. Load and examine dataset
2. Run baseline RAG evaluation
3. Analyze and visualize results

In [None]:
import os
import json
import sys
import boto3
import asyncio
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm import tqdm

# Make the repository root importable so the `utils.*` packages resolve.
# The root is two directories above the notebook's working directory;
# resolve() turns it into an absolute path so the sys.path entry stays valid
# even if the working directory changes later.
project_root = Path("../..").resolve()
if sys.path.count(str(project_root)) == 0:
    # Register the root only once so re-running the cell does not add duplicates.
    sys.path.append(str(project_root))

# Import utilities
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.visualization.comparison_plots import BenchmarkVisualizer
from utils.aws.opensearch_utils import OpenSearchManager
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    prepare_documents_for_rag,
    save_dataset_info
)

# Import RAG implementation
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# Benchmark configuration
# -----------------------
# Dataset identity and its on-disk location under the project root.
DATASET_NAME = "OriginOfCovid19Dataset"
DATASET_DIR = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
# None (or any falsy value) evaluates every example; an integer N evaluates
# only the first N examples.
NUM_EVAL_SAMPLES = None  # Set to a number for partial evaluation

# OpenSearch settings
OPENSEARCH_DOMAIN = "baseline-rag-benchmark-store"
CLEANUP_RESOURCES = True  # Default to cleaning up to avoid unexpected costs

# Cost notice, emitted as one print call with one line per message.
print(
    "Note: This notebook uses Amazon OpenSearch which incurs costs.",
    "CLEANUP_RESOURCES is enabled by default to delete resources after benchmarking.",
    "Set CLEANUP_RESOURCES = False if you want to preserve the OpenSearch domain.",
    sep="\n",
)

In [None]:
# OpenSearch Setup
# Creates the manager for the benchmark domain, publishes the domain endpoint
# through the OPENSEARCH_HOST environment variable, then loads the baseline RAG
# implementation notebook as a module. The statement order below matters.
print("Setting up OpenSearch...")
manager = OpenSearchManager(
    domain_name=OPENSEARCH_DOMAIN,
    cleanup_enabled=CLEANUP_RESOURCES,
    verbose=False  # Don't print detailed status
)

# Set up domain and get endpoint
# NOTE(review): os.environ only accepts str values, so this assignment raises
# TypeError if setup_domain() returns None — confirm it always returns a string.
endpoint = manager.setup_domain()
os.environ['OPENSEARCH_HOST'] = endpoint

# Import BaselineRAG after setting OPENSEARCH_HOST
# (presumably the implementation notebook reads OPENSEARCH_HOST when it is
# loaded — verify in implementation.ipynb before reordering these lines)
implementation_path = str(project_root / 'rag_implementations/baseline_rag/implementation.ipynb')
baseline_rag = notebook_to_module(implementation_path)
BaselineRAG = baseline_rag.BaselineRAG

# Initialize evaluator and visualizer
# Both are used by later cells: `evaluator` in run_evaluation() and
# `visualizer` in the plotting cell.
evaluator = RAGMetricsEvaluator()
visualizer = BenchmarkVisualizer()

In [None]:
# Load the labeled dataset and its source documents.
# download_if_missing=True asks the loader to fetch the dataset when it is
# not already present in DATASET_DIR.
print(f"Loading {DATASET_NAME}...")
dataset, documents = load_labeled_dataset(DATASET_DIR, download_if_missing=True)
print(f"Loaded {len(dataset.examples)} examples and {len(documents)} documents")

# Summarize the dataset structure and show it as pretty-printed JSON.
dataset_info = examine_dataset_structure(dataset, documents)
print("\nDataset Structure:")
print(json.dumps(dataset_info, indent=2))

# Persist the summary next to the dataset files.
dataset_info_path = DATASET_DIR / 'dataset_info.json'
save_dataset_info(dataset_info, dataset_info_path)
print(f"\nDataset information saved to: {dataset_info_path}")

In [None]:
# Initialize baseline RAG against an index named after the dataset.
print("Initializing baseline RAG...")
rag = BaselineRAG(index_name=f"{DATASET_NAME.lower()}-benchmark")

# Convert the loaded documents into the form the RAG pipeline ingests.
print("Preparing documents...")
prepared_docs = prepare_documents_for_rag(documents, DATASET_NAME)

# Check whether the index already holds documents.
# Only the count call is guarded: the previous version also wrapped
# ingest_documents() in a bare `except:`, so a failure midway through
# ingestion was swallowed and ingestion was started a second time (risking
# duplicate documents), and Ctrl-C (KeyboardInterrupt) was caught as well.
print("Checking existing documents...")
try:
    doc_count = rag.opensearch.count(index=rag.index_name)['count']
except Exception as exc:
    # Most likely the index does not exist yet. Any other cause (for example
    # connectivity or permissions) will resurface from ingest_documents()
    # below, which is intentionally left unguarded.
    print(
        f"Could not count documents in '{rag.index_name}' "
        f"({type(exc).__name__}: {exc}); treating the index as empty"
    )
    doc_count = 0

if doc_count > 0:
    print(f"Found {doc_count} documents already indexed")
    print("Skipping ingestion to avoid duplicates")
else:
    print("Ingesting documents...")
    rag.ingest_documents(prepared_docs)

In [None]:
# Run evaluation
async def run_evaluation():
    """Evaluate the baseline RAG on the labeled examples and print the scores.

    Uses the first ``NUM_EVAL_SAMPLES`` examples when that setting is truthy,
    otherwise every example in ``dataset.examples``. Each selected query is
    answered with ``rag.query`` and the answers are scored by
    ``evaluator.evaluate_labeled``.

    Returns:
        The mapping returned by ``evaluator.evaluate_labeled``; each value is
        printed with four decimal places, keyed by metric name.
    """
    print("Running evaluation...")
    if NUM_EVAL_SAMPLES:
        eval_examples = dataset.examples[:NUM_EVAL_SAMPLES]
    else:
        eval_examples = dataset.examples

    evaluate = evaluator.evaluate_labeled

    queries = [example.query for example in eval_examples]
    # NOTE(review): contexts are built from the full document list (one
    # single-element list per document), not per query, so this list is not
    # necessarily the same length as `queries` — confirm what
    # evaluate_labeled expects for `contexts`.
    contexts = [[document.text] for document in documents]
    # Answers are generated synchronously, one rag.query call per example.
    generated_answers = [rag.query(example.query)['response'] for example in eval_examples]
    reference_answers = [example.reference_answer for example in eval_examples]

    results = await evaluate(
        queries=queries,
        contexts=contexts,
        generated_answers=generated_answers,
        reference_answers=reference_answers,
    )

    # Display results
    print("\nEvaluation Results:")
    for metric_name, metric_score in results.items():
        print(f"{metric_name}: {metric_score:.4f}")

    return results

# Top-level `await` works here because notebook (IPython) kernels execute
# cells inside a running event loop; in a plain Python script this line would
# be a SyntaxError and asyncio.run(run_evaluation()) would be needed instead.
results = await run_evaluation()

In [None]:
# Save results
# Writes a JSON summary of the benchmark run to <project_root>/results.
results_dir = project_root / "results"
# parents=True keeps this working even if an intermediate directory is missing.
results_dir.mkdir(parents=True, exist_ok=True)
results_path = results_dir / 'baseline_rag_results.json'

# `eval_examples` only exists inside run_evaluation(), so referencing it here
# raised NameError. Recompute the evaluated subset with the same rule that
# run_evaluation() applies (first NUM_EVAL_SAMPLES examples when set, otherwise
# all examples).
evaluated_examples = (
    dataset.examples[:NUM_EVAL_SAMPLES] if NUM_EVAL_SAMPLES else dataset.examples
)

results_data = {
    'dataset': DATASET_NAME,
    'num_examples': len(dataset.examples),
    'num_documents': len(documents),
    'num_evaluated': len(evaluated_examples),
    'metrics': results
}

# Explicit encoding so the output does not depend on the platform default.
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {results_path}")

In [None]:
# Visualize results
# Plots the metric scores returned by run_evaluation() as a bar chart. `data`
# maps a series label to its metrics, and only the baseline series is passed
# here.
visualizer.plot_comparison(
    data={'Baseline RAG': results},
    comparison_type="metrics",
    plot_type="bar",
    title=f'Baseline RAG Evaluation ({DATASET_NAME})'
)

In [None]:
# Spot-check the pipeline with a few hand-written questions about the dataset's
# source article, printing each question followed by the generated response.
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?"
]

print("Testing example queries...\n")
for question in example_queries:
    print(f"Query: {question}")
    answer = rag.query(question)['response']
    print(f"Response: {answer}\n")

In [None]:
# Resource Cleanup
# The manager was constructed with cleanup_enabled=CLEANUP_RESOURCES.
# NOTE(review): presumably cleanup() only deletes the OpenSearch domain when
# that flag is True — confirm in OpenSearchManager.cleanup.
# This cell only runs if it is reached, so if an earlier cell fails, run it
# manually to avoid leaving a billable domain behind.
manager.cleanup()