# Baseline RAG Benchmarking

This notebook benchmarks our baseline RAG implementation using the Origin of Covid-19 dataset.

## Features
- Automatic dataset loading
- Document ingestion with caching
- RAG evaluation using RAGAs metrics
- Performance visualization

## Process
1. Download the dataset if it is not already on disk; otherwise load the local copy
2. Process and ingest documents if not already in vector store
3. Run evaluation
4. Visualize results

In [None]:
import os
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from tqdm import tqdm

# Add project root to path for imports
# The root is taken to be two directory levels above the current working
# directory (the relative path is resolved against the CWD, not this file).
project_root = Path("../..").resolve()
# Guarded so that re-running this cell does not append a duplicate entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import our RAG implementation
from rag_implementations.baseline_rag.implementation import AWSConfig, BaselineRAG
from rag_implementations.baseline_rag.ingestion import ingest_documents

# Import utilities
from utils.metrics.rag_metrics import calculate_metrics
from utils.visualization.comparison_plots import plot_comparison_results

In [None]:
class DatasetManager:
    """Manages dataset downloading and loading.

    The dataset is stored in ``dataset_dir``. A dataset counts as present
    when both ``rag_dataset.json`` and the ``source_files`` directory exist
    there; otherwise it is downloaded.
    """

    def __init__(
        self,
        dataset_name: str = "OriginOfCovid19Dataset",
        dataset_dir: Optional[Path] = None,
    ):
        """Create a manager for one dataset.

        Args:
            dataset_name: Name passed to ``download_llama_dataset``.
            dataset_dir: Directory holding (or receiving) the dataset files.
                Defaults to the Covid-19 directory under the project root.
                Pass a different directory when using another
                ``dataset_name``; otherwise an already-downloaded Covid-19
                dataset would be loaded in its place.
        """
        self.dataset_name = dataset_name
        if dataset_dir is None:
            # Historical default: only correct for the default dataset name.
            dataset_dir = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
        # Coerce so that plain string paths are accepted as well.
        self.dataset_dir = Path(dataset_dir)

    def ensure_dataset_exists(self) -> Tuple[Any, List[Dict[str, Any]]]:
        """Download dataset if it doesn't exist, otherwise load from disk.

        Returns:
            A ``(dataset, documents)`` pair. On the download path this is
            whatever ``download_llama_dataset`` returns (expected to be the
            same pair shape -- confirm against the llama_index version in use).
        """
        dataset_file = self.dataset_dir / "rag_dataset.json"
        source_dir = self.dataset_dir / "source_files"

        # Both artifacts are required; a partial download triggers a re-download.
        if not (dataset_file.exists() and source_dir.exists()):
            print(f"Downloading {self.dataset_name}...")
            # Imported lazily so llama_index is only needed when this runs.
            from llama_index.core.llama_dataset import download_llama_dataset
            return download_llama_dataset(self.dataset_name, str(self.dataset_dir))

        print(f"Loading existing dataset from {self.dataset_dir}...")
        from llama_index.core.llama_dataset import LabelledRagDataset
        from llama_index.core import SimpleDirectoryReader

        dataset = LabelledRagDataset.from_json(str(dataset_file))
        documents = SimpleDirectoryReader(str(source_dir)).load_data()
        return dataset, documents

In [None]:
class VectorStoreManager:
    """Manages vector store operations for one dataset.

    The index name is derived from the lower-cased dataset name with a
    ``-rag-documents`` suffix.
    """

    def __init__(self, dataset_name: str):
        self.dataset_name = dataset_name
        self.index_name = f"{dataset_name.lower()}-rag-documents"

    def is_ingested(self, rag_system: "BaselineRAG") -> bool:
        """Check if documents are already ingested.

        Calls ``count(index=...)`` on ``rag_system.config.opensearch``
        (presumably an OpenSearch client -- confirm) and reports whether the
        returned ``'count'`` is positive.

        Returns:
            True when the index reports at least one document. False when it
            reports none, or when the check itself fails for any reason; in
            that case the reason is printed so that a connectivity or
            permission problem is not mistaken for an empty index.
        """
        try:
            response = rag_system.config.opensearch.count(index=self.index_name)
            return response['count'] > 0
        except Exception as exc:
            # Best-effort check: keep returning False, but say why.
            print(
                f"Could not check index '{self.index_name}' "
                f"({type(exc).__name__}: {exc}); assuming documents are not ingested"
            )
            return False

    def prepare_documents(self, documents: List[Any]) -> List[Dict[str, Any]]:
        """Prepare documents for ingestion.

        Args:
            documents: Objects exposing ``text`` and ``metadata`` attributes.

        Returns:
            One dict per document with ``'content'`` and ``'metadata'`` keys.
            The metadata starts with ``'dataset'`` set to this manager's
            dataset name and is then extended with the document's own
            metadata, so a document-level ``'dataset'`` key takes precedence.
            A document whose metadata is ``None`` is treated as having none.
        """
        return [
            {
                'content': doc.text,
                'metadata': {
                    'dataset': self.dataset_name,
                    **(doc.metadata or {}),
                },
            }
            for doc in documents
        ]

    def ingest_if_needed(self, rag_system: "BaselineRAG", documents: List[Any]) -> None:
        """Ingest documents if not already in vector store.

        Args:
            rag_system: Object whose ``ingest_documents`` method receives the
                prepared documents.
            documents: Raw documents, converted with ``prepare_documents``.
        """
        if self.is_ingested(rag_system):
            print("Documents already ingested")
            return

        print("Ingesting documents...")
        rag_system.ingest_documents(self.prepare_documents(documents))

In [None]:
def evaluate_rag(rag_system: BaselineRAG, dataset, num_samples: Optional[int] = None) -> Dict:
    """Evaluate RAG system using dataset.

    Each example's ``query`` is sent to ``rag_system.query``; the returned
    ``'response'`` and ``'context'`` are scored against the example's
    ``reference_answer`` with ``calculate_metrics``.

    Args:
        rag_system: System under test; ``query`` must return a mapping with
            ``'response'`` and ``'context'`` keys.
        dataset: Object exposing an ``examples`` sequence whose items have
            ``query`` and ``reference_answer`` attributes.
        num_samples: When truthy, evaluate a random sample of at most this
            many examples. ``None`` (or ``0``) evaluates every example.

    Returns:
        A dict with ``'individual_results'`` (one entry per evaluated example)
        and ``'aggregated_metrics'`` (the mean of each metric). When no
        example was evaluated, ``'aggregated_metrics'`` is an empty dict.
    """
    examples = dataset.examples
    if num_samples:
        from random import sample
        # Cap at the population size; sample() rejects larger requests.
        examples = sample(examples, min(num_samples, len(examples)))

    results = []
    for example in tqdm(examples, desc="Evaluating"):
        response = rag_system.query(example.query)
        answer = response['response']
        context = response['context']

        metrics = calculate_metrics(
            query=example.query,
            response=answer,
            context=context,
            ground_truth=example.reference_answer
        )

        results.append({
            'query': example.query,
            'response': answer,
            'ground_truth': example.reference_answer,
            'context': context,
            'metrics': metrics
        })

    # Aggregate metrics: mean per metric, using the first result's metric
    # names. Guarded because an empty run has no first result and would
    # otherwise fail on results[0] / divide by zero.
    if results:
        total = len(results)
        aggregated = {
            metric: sum(r['metrics'][metric] for r in results) / total
            for metric in results[0]['metrics']
        }
    else:
        aggregated = {}

    return {
        'individual_results': results,
        'aggregated_metrics': aggregated
    }

In [None]:
# Configuration
# DATASET_NAME is handed to both managers below and reused in the saved
# results and the plot title.
DATASET_NAME = "OriginOfCovid19Dataset"
# Passed to evaluate_rag as num_samples; None evaluates every example.
NUM_EVAL_SAMPLES = None  # Set to a number for partial evaluation

# Initialize managers
# One handles dataset download/loading, the other the vector-store index
# whose name is derived from DATASET_NAME.
dataset_manager = DatasetManager(DATASET_NAME)
vector_store_manager = VectorStoreManager(DATASET_NAME)

In [None]:
# Load or download dataset
# Downloads on first use; later runs read the copy already on disk.
dataset, documents = dataset_manager.ensure_dataset_exists()
# Sanity check: report how many labelled examples and source documents came back.
print(f"Dataset loaded: {len(dataset.examples)} examples, {len(documents)} documents")

In [None]:
# Initialize RAG system and ingest documents if needed
# NOTE(review): AWSConfig() is built with no arguments, so it presumably
# picks up its settings from the environment -- confirm.
config = AWSConfig()
# The RAG system is pointed at the same index name the manager checks.
rag = BaselineRAG(config, index_name=vector_store_manager.index_name)
# Skips ingestion when the index already reports documents.
vector_store_manager.ingest_if_needed(rag, documents)

In [None]:
# Run evaluation
# Queries the RAG system once per evaluated example, so this cell can take
# a while; set NUM_EVAL_SAMPLES for a quicker partial run.
print("Running evaluation...")
results = evaluate_rag(rag, dataset, num_samples=NUM_EVAL_SAMPLES)

# Display results
# Each aggregated metric is the mean over the evaluated examples,
# printed with four decimal places.
print("\nAggregated Metrics:")
for metric, value in results['aggregated_metrics'].items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Save results
# Written to a "results" directory relative to the current working directory.
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)  # no error if the directory already exists

# Bundle dataset and run sizes with the full evaluation output
# (per-example results plus aggregated metrics).
results_data = {
    'dataset': DATASET_NAME,
    'num_examples': len(dataset.examples),
    'num_documents': len(documents),
    'num_evaluated': len(results['individual_results']),
    'results': results
}

# NOTE(review): json.dump needs every value under 'results' to be
# JSON-serializable -- confirm calculate_metrics returns plain Python numbers
# and that the stored context values are plain strings/lists.
with open(results_dir / 'baseline_rag_results.json', 'w') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {results_dir / 'baseline_rag_results.json'}")

In [None]:
# Visualize results
# The first argument maps a display label to that system's aggregated
# metrics; only the baseline is plotted here.
plot_comparison_results(
    {
        'Baseline RAG': results['aggregated_metrics']
    },
    title=f'RAG Evaluation Results ({DATASET_NAME})'
)

In [None]:
# Example queries
# Hand-written questions for a quick qualitative look at the answers;
# they are not scored against reference answers.
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?"
]

print("Testing example queries...\n")
for query in example_queries:
    print(f"Query: {query}")
    # Only the generated answer is printed; the rest of the result is ignored.
    result = rag.query(query)
    print(f"Response: {result['response']}\n")