# Baseline RAG Benchmarking

This notebook benchmarks a baseline Retrieval-Augmented Generation (RAG) implementation that uses Amazon OpenSearch for vector storage.

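## Configuration Options

The options below are set in the `RAG_CONFIG` dictionary in the configuration cell and are passed to `BaselineRAG` when it is constructed.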

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)

### Vector Search
- search_type: Type of vector search ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score (default: None)

### API Settings
- max_retries: Maximum retry attempts (default: 5)
- min_delay: Minimum retry delay in seconds (default: 1)
- max_delay: Maximum retry delay in seconds (default: 60)

In [None]:
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Resolve the repository root (two levels above the working directory) and
# register it on sys.path once, so the project-local imports below resolve.
project_root = Path("../..").resolve()
_project_root_entry = str(project_root)
if _project_root_entry not in sys.path:
    sys.path.append(_project_root_entry)

# Import utilities
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info,
    convert_to_ragas_dataset
)
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# Dataset Configuration
# DATASET_NAME labels the ingested documents and, lower-cased, names the index.
DATASET_NAME = "OriginOfCovid19Dataset"
# Location of the labeled dataset, relative to the project root.
DATASET_DIR = project_root.joinpath("datasets/rag_evaluation/labeled/covid19_origin")
# None evaluates every example; set an integer to evaluate only the first N.
NUM_EVAL_SAMPLES = None

# RAG Configuration
# The settings are grouped by concern and then merged, in order, into the
# single flat dictionary that is unpacked into BaselineRAG(...).

# Document processing (sizes are counted in words)
_DOCUMENT_PROCESSING = {
    "chunk_size": 500,  # 500 words ≈ 2000 chars
    "chunk_overlap": 50,  # 50 words overlap
    "enable_chunking": True,
}

# Vector search
_VECTOR_SEARCH = {
    "search_type": "script",
    "similarity_threshold": None,
}

# OpenSearch config
_OPENSEARCH = {
    "index_settings": None,
}

# API settings (retry count and delay bounds in seconds)
_API_SETTINGS = {
    "max_retries": 5,
    "min_delay": 1.0,
    "max_delay": 60.0,
}

RAG_CONFIG = {
    **_DOCUMENT_PROCESSING,
    **_VECTOR_SEARCH,
    **_OPENSEARCH,
    **_API_SETTINGS,
}

# Remind the user up front that running this notebook is billable.
for _notice in (
    "Note: This notebook uses Amazon OpenSearch which incurs costs.",
    "OpenSearch resources will be cleaned up after benchmarking.",
):
    print(_notice)

In [None]:
# The RAG implementation and its ingestion helper live in notebooks, so each
# notebook is loaded as a module and the needed name is pulled from it.
implementation_path = str(project_root / 'rag_implementations/baseline_rag/implementation.ipynb')
ingestion_path = str(project_root / 'rag_implementations/baseline_rag/ingestion.ipynb')

_implementation_module = notebook_to_module(implementation_path)
BaselineRAG = _implementation_module.BaselineRAG

_ingestion_module = notebook_to_module(ingestion_path)
ingest_documents = _ingestion_module.ingest_documents

# Metrics evaluator used by the evaluation cell
evaluator = RAGMetricsEvaluator()

In [None]:
# Load the labeled dataset together with its source documents
print(f"Loading {DATASET_NAME}...")
dataset, documents = load_labeled_dataset(DATASET_DIR, download_if_missing=True)
print(f"Loaded {len(dataset.examples)} examples and {len(documents)} documents")

# Keep only the first NUM_EVAL_SAMPLES examples when a limit is set;
# None (or 0) means the full set of examples is evaluated.
if NUM_EVAL_SAMPLES:
    eval_examples = dataset.examples[:NUM_EVAL_SAMPLES]
else:
    eval_examples = dataset.examples
print(f"Using {len(eval_examples)} examples for evaluation")

# Summarize the dataset structure and show it as pretty-printed JSON
dataset_info = examine_dataset_structure(dataset, documents)
print("\nDataset Structure:")
print(json.dumps(dataset_info, indent=2))

# Persist the summary next to the dataset files
dataset_info_path = DATASET_DIR / 'dataset_info.json'
save_dataset_info(dataset_info, dataset_info_path)
print(f"\nDataset information saved to: {dataset_info_path}")

In [None]:
# Initialize BaselineRAG with configuration
print("\nInitializing BaselineRAG...")
rag = BaselineRAG(
    # The lower-cased dataset name is used as the index name
    # (shortened to avoid domain name length issues).
    index_name=DATASET_NAME.lower(),
    **RAG_CONFIG
)

# Check if documents already exist.
# Only the total is needed here, so "size": 0 avoids returning document
# bodies, and "track_total_hits": True requests an exact total instead of
# the search API's default capped lower bound.
print("\nChecking existing documents...")
doc_count = rag.vector_store.client.search(
    index=rag.index_name,
    body={
        "size": 0,
        "track_total_hits": True,
        "query": {"match_all": {}},
    }
)['hits']['total']['value']
print(f"Found {doc_count} documents already indexed")

# Ingest only into an empty index so re-running this cell does not index the
# same source files a second time.
if doc_count > 0:
    print("Skipping ingestion since documents already exist")
else:
    print("Ingesting documents...")
    source_dir = DATASET_DIR / "source_files"
    ingest_documents(
        str(source_dir),
        rag,
        metadata={'dataset': DATASET_NAME},
        batch_size=100
    )

In [None]:
# Run evaluation
print("\nRunning evaluation...")

# Query the RAG pipeline exactly once per example and reuse the result for
# both the retrieved contexts and the generated answer. Issuing a separate
# query for each would double the number of pipeline calls and could score an
# answer against a context it was not generated from.
eval_queries = [example.query for example in eval_examples]
rag_outputs = [rag.query(query_text, k=5) for query_text in eval_queries]

results = evaluator.evaluate_unlabeled(
    queries=eval_queries,
    # One list of retrieved passage texts per query
    contexts=[
        [doc['content'] for doc in output['context']]
        for output in rag_outputs
    ],
    # One generated answer per query, aligned with `contexts`
    generated_answers=[output['response'] for output in rag_outputs],
    plot_results=True
)

In [None]:
# Hand-picked questions for a quick qualitative look at the pipeline output
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?"
]

print("Testing example queries...\n")
divider = "-" * 80
for query in example_queries:
    print(f"Query: {query}")
    result = rag.query(query, k=5)
    print(f"Response: {result['response']}\n")
    print("Context:")
    for retrieved in result['context']:
        # Show only the first 200 characters of each retrieved passage
        preview = retrieved['content'][:200]
        print(f"  - {preview}...")
    print(divider)

In [None]:
# Resource Cleanup
# Announce what is about to happen before starting the destructive cleanup.
for banner_line in (
    "=== Cleaning Up Resources ===",
    "Warning: This will delete all resources and indexed data",
    "This may take 15-20 minutes to complete",
):
    print(banner_line)

# Best-effort teardown: a failure is reported rather than raised so the user
# is told that manual cleanup may still be required.
try:
    rag.cleanup(delete_resources=True)  # Delete index and domain
    print("✅ Cleanup successful")
except Exception as cleanup_error:
    print(f"❌ Error during cleanup: {cleanup_error!s}")
    print("Some resources may need to be cleaned up manually")