# Baseline RAG Benchmarking

This notebook benchmarks our baseline RAG implementation using the Origin of Covid-19 dataset.

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Vector Search
- k: Number of context documents to retrieve (default: 3)
- search_type: Type of vector search to use ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score to include (default: None)

### OpenSearch
- index_settings: Custom index settings for performance tuning (e.g., shard and replica counts, ef_search)
- knn_params: Parameters for building the k-NN index (e.g., ef_construction, m)

### API Settings
- max_retries: Maximum number of retry attempts (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Prerequisites
- Run `setup.ipynb` first to configure the environment

## Process
1. Configure RAG parameters
2. Load and examine dataset
3. Run baseline RAG evaluation
4. Analyze and visualize results

In [None]:
import os
import json
import sys
import boto3
import asyncio
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from ragas.evaluation import EvaluationDataset, Sample

# Resolve the project root (two levels above the working directory) and make
# it importable so the `utils` imports below can be found
project_root = Path("../..").resolve()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Import utilities
from utils.aws.opensearch_utils import OpenSearchManager
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.visualization.comparison_plots import BenchmarkVisualizer
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info
)
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# --- Dataset ---------------------------------------------------------------
# DATASET_NAME is reused later for the index name and the results file name.
DATASET_NAME = "OriginOfCovid19Dataset"
DATASET_DIR = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
NUM_EVAL_SAMPLES = None  # None = evaluate every example; an int = only the first N

# --- OpenSearch ------------------------------------------------------------
OPENSEARCH_DOMAIN = "baseline-rag-benchmark-store"
CLEANUP_RESOURCES = True  # on by default to avoid unexpected costs

# --- RAG -------------------------------------------------------------------
# Every entry is forwarded as a keyword argument when BaselineRAG is created.
RAG_CONFIG = dict(
    # Document processing (sizes are counted in words)
    chunk_size=500,
    chunk_overlap=50,
    enable_chunking=True,

    # Vector search
    search_type="script",  # 'script' or 'knn'
    similarity_threshold=None,  # minimum similarity score, None = not set

    # OpenSearch index settings
    index_settings=dict(
        number_of_shards=1,
        number_of_replicas=0,
        knn=dict(
            algo_param=dict(
                ef_search=512,  # higher = more accurate but slower
            ),
        ),
    ),
    knn_params=dict(
        ef_construction=512,  # higher = more accurate index
        m=16,  # higher = more connections per node
    ),

    # API retry settings (delays in seconds)
    max_retries=5,
    min_delay=1.0,
    max_delay=60.0,
)

# Cost reminder shown every time the configuration cell runs
print(
    "Note: This notebook uses Amazon OpenSearch which incurs costs.",
    "CLEANUP_RESOURCES is enabled by default to delete resources after benchmarking.",
    "Set CLEANUP_RESOURCES = False if you want to preserve the OpenSearch domain.",
    sep="\n",
)

In [None]:
# Bring up the OpenSearch domain used as the vector store
print("Setting up OpenSearch...")
manager = OpenSearchManager(
    domain_name=OPENSEARCH_DOMAIN,
    cleanup_enabled=CLEANUP_RESOURCES,
    verbose=False,  # keep the detailed status output quiet
)

# Publish the domain endpoint through the environment
# (presumably read by the RAG implementation loaded below - TODO confirm)
endpoint = manager.setup_domain()
os.environ['OPENSEARCH_HOST'] = endpoint

# The RAG implementation and the ingestion helper live in notebooks, so they
# are loaded as modules rather than imported normally
baseline_rag_dir = project_root / 'rag_implementations/baseline_rag'
implementation_path = str(baseline_rag_dir / 'implementation.ipynb')
ingestion_path = str(baseline_rag_dir / 'ingestion.ipynb')
implementation_module = notebook_to_module(implementation_path)
BaselineRAG = implementation_module.BaselineRAG
ingestion_module = notebook_to_module(ingestion_path)
ingest_documents = ingestion_module.ingest_documents

# Helpers for scoring and plotting, used by the later cells
evaluator = RAGMetricsEvaluator()
visualizer = BenchmarkVisualizer()

In [None]:
# Load the labeled examples together with their source documents.
# download_if_missing=True asks the loader to fetch the dataset when it is
# not already on disk.
print(f"Loading {DATASET_NAME}...")
dataset, documents = load_labeled_dataset(DATASET_DIR, download_if_missing=True)
num_examples = len(dataset.examples)
num_documents = len(documents)
print(f"Loaded {num_examples} examples and {num_documents} documents")

# Summarise the dataset layout and show it as pretty-printed JSON
dataset_info = examine_dataset_structure(dataset, documents)
print("\nDataset Structure:")
print(json.dumps(dataset_info, indent=2))

# Persist the summary next to the dataset
dataset_info_path = DATASET_DIR / 'dataset_info.json'
save_dataset_info(dataset_info, dataset_info_path)
print(f"\nDataset information saved to: {dataset_info_path}")

In [None]:
# Initialize baseline RAG with the values from RAG_CONFIG
print("Initializing baseline RAG...")
rag = BaselineRAG(
    index_name=f"{DATASET_NAME.lower()}-benchmark",
    chunk_size=RAG_CONFIG["chunk_size"],
    chunk_overlap=RAG_CONFIG["chunk_overlap"],
    enable_chunking=RAG_CONFIG["enable_chunking"],
    search_type=RAG_CONFIG["search_type"],
    similarity_threshold=RAG_CONFIG["similarity_threshold"],
    index_settings=RAG_CONFIG["index_settings"],
    knn_params=RAG_CONFIG["knn_params"],
    max_retries=RAG_CONFIG["max_retries"],
    min_delay=RAG_CONFIG["min_delay"],
    max_delay=RAG_CONFIG["max_delay"]
)

# Check whether documents are already indexed.
# Only the count call is guarded. Ingestion runs outside the try block so
# that an ingestion failure surfaces as an error instead of being caught and
# silently triggering a second ingestion run (which could duplicate
# documents that were already written).
print("\nChecking existing documents...")
try:
    doc_count = rag.opensearch.count(index=rag.index_name)['count']
except Exception as exc:
    # Most likely the index does not exist yet; report the error type so
    # that other causes (e.g. connectivity problems) remain visible.
    print(
        f"Could not count documents in '{rag.index_name}' "
        f"({type(exc).__name__}); assuming the index does not exist yet"
    )
    doc_count = 0

if doc_count > 0:
    print(f"Found {doc_count} documents already indexed")
    print("Skipping ingestion to avoid duplicates")
else:
    print("Ingesting documents...")
    # Use new Langchain-based ingestion
    source_dir = DATASET_DIR / "source_files"
    ingest_documents(
        str(source_dir),
        rag,
        metadata={'dataset': DATASET_NAME},
        batch_size=100
    )

In [None]:
# Run evaluation
async def run_evaluation():
    """Query the RAG system for every evaluation example and score the answers.

    Reads the notebook-level names ``dataset``, ``documents``, ``rag``,
    ``evaluator`` and ``NUM_EVAL_SAMPLES``. When ``NUM_EVAL_SAMPLES`` is set,
    only the first ``NUM_EVAL_SAMPLES`` examples are evaluated.

    Returns:
        The metrics object returned by ``evaluator.evaluate_labeled``; it is
        iterated here with ``.items()`` as metric name -> numeric score.

    Raises:
        Exception: any error raised while querying or while calculating the
            metrics is re-raised after a short diagnostic has been shown.
    """
    print("Running evaluation...")
    eval_examples = dataset.examples[:NUM_EVAL_SAMPLES] if NUM_EVAL_SAMPLES else dataset.examples
    total = len(eval_examples)

    # The context texts do not depend on the query, so build them once.
    # NOTE(review): every query is paired with the text of ALL dataset
    # documents rather than with the passages retrieved for that query. If
    # rag.query() exposes its retrieved passages, those are probably the
    # intended contexts - confirm against BaselineRAG.
    doc_texts = [doc.text for doc in documents]

    # Progress bar for overall evaluation
    samples = []
    with tqdm_notebook(total=total, desc="Evaluating") as pbar:
        for position, example in enumerate(eval_examples, start=1):
            try:
                result = rag.query(example.query)
                # Create RAGAs sample; each sample gets its own list copy
                samples.append(
                    Sample(
                        question=example.query,
                        contexts=list(doc_texts),
                        answer=result['response'],
                        ground_truth=example.reference_answer
                    )
                )
                pbar.update(1)
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': 'Success'
                })
            except Exception as e:
                # Show which query failed, then abort the whole evaluation
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': f'Error: {type(e).__name__}'
                })
                raise

    # Create evaluation dataset
    print("\nPreparing evaluation dataset...")
    # NOTE(review): not consumed below; kept because constructing it may
    # validate the samples - confirm and remove if it has no effect.
    eval_dataset = EvaluationDataset(samples=samples)

    # Evaluate results
    print("\nCalculating metrics...")
    try:
        results = await evaluator.evaluate_labeled(
            queries=[ex.query for ex in eval_examples],
            # One context list per query, so that all four arguments have the
            # same length (previously there was one entry per document).
            contexts=[list(doc_texts) for _ in eval_examples],
            generated_answers=[s.answer for s in samples],
            reference_answers=[ex.reference_answer for ex in eval_examples]
        )
    except Exception as e:
        print(f"Error during evaluation: {type(e).__name__}")
        print(f"Error details: {str(e)}")
        print("\nDataset structure:")
        print(f"Number of samples: {len(samples)}")
        # Guarded so that an empty sample list cannot raise IndexError here
        # and hide the original error.
        if samples:
            first = samples[0]
            print("Sample structure:")
            print(f"- question: {first.question[:100]}...")
            print(f"- contexts: {len(first.contexts)} contexts")
            print(f"- answer: {first.answer[:100]}...")
            print(f"- ground_truth: {first.ground_truth[:100]}...")
        raise

    # Display results
    print("\nEvaluation Results:")
    for metric, score in results.items():
        print(f"{metric}: {score:.4f}")

    return results

# Top-level `await` works here because Jupyter/IPython runs cells with
# autoawait enabled; in a plain Python script this call would have to be
# wrapped, e.g. asyncio.run(run_evaluation()).
results = await run_evaluation()

In [None]:
# Save results
results_dir = project_root / "evaluation_pipelines/rag_evaluations/results"
# parents=True so a fresh checkout without the intermediate folders works too
results_dir.mkdir(parents=True, exist_ok=True)

# `eval_examples` is local to run_evaluation() and not visible in this cell,
# so the number of evaluated examples is re-derived with the same rule.
if NUM_EVAL_SAMPLES:
    num_evaluated = len(dataset.examples[:NUM_EVAL_SAMPLES])
else:
    num_evaluated = len(dataset.examples)

results_data = {
    'dataset': DATASET_NAME,
    'num_examples': len(dataset.examples),
    'num_documents': len(documents),
    'num_evaluated': num_evaluated,
    'rag_config': RAG_CONFIG,
    # Convert scores to plain floats so that non-native numeric types
    # (e.g. NumPy scalars) cannot break JSON serialisation.
    'metrics': {metric: float(score) for metric, score in results.items()}
}

results_file = results_dir / f'baseline_rag_results_{DATASET_NAME.lower()}.json'
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {results_file}")

In [None]:
# Plot the baseline metrics as a bar chart; the single entry in `plot_data`
# is labelled 'Baseline RAG'
plot_data = {'Baseline RAG': results}
plot_title = f'Baseline RAG Evaluation ({DATASET_NAME})'
visualizer.plot_comparison(
    data=plot_data,
    comparison_type="metrics",
    plot_type="bar",
    title=plot_title,
)

In [None]:
# Spot-check the pipeline with a few hand-written questions about the dataset
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?",
]

print("Testing example queries...\n")
for query in example_queries:
    # Echo the question before running it so slow queries are identifiable
    print(f"Query: {query}")
    result = rag.query(query)
    answer = result['response']
    print(f"Response: {answer}\n")

In [None]:
# Resource Cleanup
# The manager was created with cleanup_enabled=CLEANUP_RESOURCES; presumably
# that flag decides whether this call actually deletes the OpenSearch domain
# - TODO confirm in OpenSearchManager.
# NOTE: this only runs if the notebook reaches this cell, so after a failure
# in an earlier cell it has to be executed manually to avoid ongoing costs.
manager.cleanup()