# RAG Implementation Comparison Template

This notebook provides a template for comparing different RAG implementations. It can be used to evaluate and compare any RAG techniques, including but not limited to:
- Baseline RAG vs GraphRAG
- Different vector stores
- Different chunking strategies
- Different embedding models
- Different retrieval methods

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Vector Search
- k: Number of context documents to retrieve (default: 3)
- search_type: Type of vector search to use ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score to include (default: None)

### OpenSearch
- index_settings: Custom index settings for performance tuning (e.g., shard and replica counts, k-NN ef_search)
- knn_params: Parameters for the k-NN algorithm (e.g., ef_construction, m)

### API Settings
- max_retries: Maximum number of retry attempts (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Prerequisites
- Run setup.ipynb first to configure the environment

In [None]:
import os
import json
import sys
import boto3
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from datasets import Dataset
from datetime import datetime

# Add project root to path for imports
project_root = Path("../..").resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import utilities
from utils.aws.opensearch_utils import OpenSearchManager
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info
)
from utils.notebook_utils.importable import notebook_to_module

## Configuration

In [None]:
# Implementation Configuration
# Maps an implementation id to the settings needed to load and evaluate it:
#   "name"          - label used in log output, appended to the OpenSearch
#                     domain prefix, and used as the key in the saved results.
#   "notebook_path" - notebook that is converted to a module by
#                     load_implementation(); it must expose RAGImplementation.
#   "config"        - passed as keyword arguments to RAGImplementation(...).
IMPLEMENTATIONS = {
    "implementation_1": {
        "name": "baseline_rag",
        "notebook_path": "../../rag_implementations/baseline_rag/implementation.ipynb",
        "config": {
            # Document processing
            "chunk_size": 500,  # 500 words ≈ 2000 chars
            "chunk_overlap": 50,  # 50 words overlap
            "enable_chunking": True,
            
            # Vector search
            "search_type": "script",  # 'script' or 'knn'
            "similarity_threshold": None,  # Minimum similarity score
            
            # OpenSearch settings
            "index_settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "knn": {
                    "algo_param": {
                        "ef_search": 512  # Higher = more accurate but slower
                    }
                }
            },
            "knn_params": {
                "ef_construction": 512,  # Higher = more accurate index
                "m": 16  # Higher = more connections per node
            },
            
            # API settings
            "max_retries": 5,
            "min_delay": 1.0,
            "max_delay": 60.0
        }
    },
    "implementation_2": {
        "name": "graph_rag",
        "notebook_path": "../../rag_implementations/graph_rag/implementation.ipynb",
        # Empty config: RAGImplementation is constructed with no keyword
        # arguments until GraphRAG-specific options are added below.
        "config": {
            # Add GraphRAG specific configuration
        }
    }
}

# Dataset Configuration
# Grouped by dataset type, then by dataset id. For each dataset:
#   "name" - label printed while evaluating.
#   "dir"  - directory of the dataset, resolved relative to project_root.
# The dataset id (e.g. "covid19") becomes the key of that dataset's results.
# NOTE(review): the comparison loop passes every entry, including those under
# "unlabeled", to load_labeled_dataset - confirm before adding unlabeled data.
DATASETS = {
    "labeled": {
        "covid19": {
            "name": "OriginOfCovid19Dataset",
            "dir": "datasets/rag_evaluation/labeled/covid19_origin"
        }
    },
    "unlabeled": {
        # Add unlabeled datasets here
    }
}

# OpenSearch Configuration
#   "domain_prefix"     - combined with each implementation name to form the
#                         OpenSearch domain name.
#   "cleanup_resources" - forwarded to OpenSearchManager as cleanup_enabled.
OPENSEARCH_CONFIG = {
    "domain_prefix": "rag-comparison",
    "cleanup_resources": True  # Default to cleaning up to avoid unexpected costs
}

# The notice names the actual setting so that following it really changes the
# cleanup behaviour (there is no standalone CLEANUP_RESOURCES variable).
print("Note: This notebook uses Amazon OpenSearch which incurs costs.")
print("OPENSEARCH_CONFIG['cleanup_resources'] is enabled by default to delete resources after benchmarking.")
print("Set OPENSEARCH_CONFIG['cleanup_resources'] = False if you want to preserve the OpenSearch domain.")

## Load Implementations

In [None]:
def load_implementation(impl_config: Dict[str, Any]) -> Any:
    """Instantiate the RAG implementation described by ``impl_config``.

    The notebook at ``impl_config["notebook_path"]`` is converted to a module
    with ``notebook_to_module`` and its ``RAGImplementation`` class is
    constructed with ``impl_config["config"]`` as keyword arguments.

    Args:
        impl_config: Mapping with the keys ``"name"``, ``"notebook_path"``
            and ``"config"``.

    Returns:
        The constructed ``RAGImplementation`` instance.

    Raises:
        Exception: Any error raised while loading or constructing is reported
            on stdout and then re-raised unchanged.
    """
    try:
        notebook_module = notebook_to_module(impl_config["notebook_path"])
        settings = impl_config["config"]
        return notebook_module.RAGImplementation(**settings)
    except Exception as exc:
        # Report which implementation failed, then let the error propagate.
        label = impl_config['name']
        print(f"Error loading {label}: {str(exc)}")
        raise

# Load implementations
# Instantiate every entry of IMPLEMENTATIONS, keyed by its implementation id.
# An error raised by load_implementation propagates and stops the cell.
# NOTE(review): implementations are constructed here, before setup_opensearch()
# sets OPENSEARCH_HOST - confirm RAGImplementation reads the host lazily.
implementations = {}
for impl_id, impl_config in IMPLEMENTATIONS.items():
    implementations[impl_id] = load_implementation(impl_config)

## OpenSearch Setup

In [None]:
def setup_opensearch(implementation_name: str) -> OpenSearchManager:
    """Set up an OpenSearch domain for an implementation.

    Builds the domain name from ``OPENSEARCH_CONFIG['domain_prefix']`` and
    ``implementation_name``, creates an ``OpenSearchManager`` for it, calls
    ``setup_domain()`` and stores the returned endpoint in the
    ``OPENSEARCH_HOST`` environment variable.

    Args:
        implementation_name: Implementation label (e.g. ``"baseline_rag"``).

    Returns:
        The ``OpenSearchManager`` that owns the domain; the caller is
        responsible for calling ``cleanup()`` on it.
    """
    import re  # local import keeps this cell independent of the imports cell

    raw_name = f"{OPENSEARCH_CONFIG['domain_prefix']}-{implementation_name}"
    # Amazon OpenSearch Service domain names may only contain a-z, 0-9 and "-".
    # Implementation names such as "baseline_rag" contain "_", so normalize
    # every disallowed character to "-".
    # NOTE(review): the service also limits names to 28 characters; length is
    # deliberately not truncated here to avoid silent name collisions.
    domain_name = re.sub(r"[^a-z0-9-]", "-", raw_name.lower())

    print(f"Setting up OpenSearch for {implementation_name}...")
    manager = OpenSearchManager(
        domain_name=domain_name,
        cleanup_enabled=OPENSEARCH_CONFIG["cleanup_resources"],
        verbose=False
    )

    # Set up domain and publish its endpoint through the environment.
    endpoint = manager.setup_domain()
    os.environ['OPENSEARCH_HOST'] = endpoint

    return manager

## Evaluation Functions

In [None]:
def run_evaluation(implementation: Any, dataset: Dataset, evaluator: RAGMetricsEvaluator):
    """Run evaluation for a single implementation on a dataset.

    Queries ``implementation`` once per entry of ``dataset.examples`` and then
    passes the collected questions, contexts, answers and references to
    ``evaluator.evaluate_labeled`` (with ``plot_results=True``).

    Args:
        implementation: Object exposing ``query(text)``. The result is indexed
            with ``'response'`` and ``'context'``; each context item is
            indexed with ``'content'``.
        dataset: Object exposing ``examples``; each example provides ``query``
            and ``reference_answer``.
            NOTE(review): annotated as ``datasets.Dataset`` but accessed via
            ``.examples`` - confirm the type returned by load_labeled_dataset.
        evaluator: Evaluator providing ``evaluate_labeled``.

    Returns:
        A dict with:
            ``'raw_results'`` - the object returned by ``evaluate_labeled``;
            ``'metrics_df'``  - ``raw_results.to_pandas().to_dict()``;
            ``'data'``        - the collected ``question`` / ``answer`` /
                                ``contexts`` / ``reference`` lists.

    Raises:
        Exception: Errors from ``implementation.query`` or from the metric
            calculation are re-raised after being reported.
    """
    print("Running evaluation...")
    examples = dataset.examples
    total = len(examples)

    questions = []
    contexts = []
    answers = []
    references = []

    # Progress bar for overall evaluation
    with tqdm_notebook(total=total, desc="Evaluating") as pbar:
        for position, example in enumerate(examples, start=1):
            try:
                result = implementation.query(example.query)

                questions.append(example.query)
                contexts.append([doc['content'] for doc in result['context']])
                answers.append(result['response'])
                references.append(example.reference_answer)

                pbar.update(1)
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': 'Success'
                })
            except Exception as e:
                # Show which query failed before aborting the whole run.
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': f'Error: {type(e).__name__}'
                })
                raise

    # Collected evaluation inputs, also returned to the caller.
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "reference": references
    }

    # Evaluate results
    print("\nCalculating metrics...")
    try:
        results = evaluator.evaluate_labeled(
            queries=questions,
            contexts=contexts,
            generated_answers=answers,
            reference_answers=references,
            plot_results=True
        )

        # Convert results to pandas DataFrame
        df = results.to_pandas()

        # Return both raw results and DataFrame
        return {
            'raw_results': results,
            'metrics_df': df.to_dict(),
            'data': data
        }

    except Exception as e:
        print(f"Error during evaluation: {type(e).__name__}")
        print(f"Error details: {str(e)}")
        print("\nDataset contents:")
        for key, value in data.items():
            print(f"\n{key}:")
            print(f"Type: {type(value)}")
            print(f"Length: {len(value)}")
            # The diagnostics must never raise themselves, otherwise they
            # would replace the evaluation error that is re-raised below.
            if not value:
                print("First item: <empty>")
                continue
            first = value[0]
            # Strings and context lists are truncated; anything else
            # (e.g. a missing reference) is printed as-is.
            preview = first[:100] if isinstance(first, (str, list)) else first
            print(f"First item: {preview}...")
        raise

## Run Comparison

In [None]:
# Initialize evaluator
evaluator = RAGMetricsEvaluator()

# Store results for each implementation:
# {implementation id: {dataset id: result of run_evaluation}}
all_results = {}

# Run evaluation for each implementation
for impl_id, implementation in implementations.items():
    impl_name = IMPLEMENTATIONS[impl_id]["name"]
    print(f"\nEvaluating {impl_name}...")

    # Set up OpenSearch
    manager = setup_opensearch(impl_name)

    # Store results for this implementation
    impl_results = {}

    try:
        # Evaluate on each dataset
        for dataset_type, datasets in DATASETS.items():
            for dataset_id, dataset_config in datasets.items():
                print(f"\nEvaluating on {dataset_config['name']}...")

                # Load dataset
                dataset_dir = project_root / dataset_config["dir"]
                # NOTE(review): `documents` is loaded but not used in this
                # cell - confirm the documents are ingested elsewhere.
                dataset, documents = load_labeled_dataset(dataset_dir)

                # Run evaluation
                results = run_evaluation(implementation, dataset, evaluator)
                impl_results[dataset_id] = results

        # Store results only when every dataset was evaluated successfully.
        all_results[impl_id] = impl_results
    finally:
        # Always release the OpenSearch resources, even when an evaluation
        # fails, so that an aborted run does not leave a domain behind.
        manager.cleanup()

## Save Results

In [None]:
def save_results(results: Dict[str, Any]):
    """Save evaluation results to a timestamped JSON file.

    The file is written to
    ``project_root / "evaluation_pipelines/rag_evaluations/results"`` (created
    together with any missing parent directories) and contains the
    ``IMPLEMENTATIONS`` and ``DATASETS`` configuration plus, per
    implementation name and dataset id, the metrics and the evaluated
    questions, answers and references.

    Args:
        results: Mapping of implementation id to a mapping of dataset id to
            the dict returned by ``run_evaluation`` (the keys ``'metrics_df'``
            and ``'data'`` are read).
    """
    results_dir = project_root / "evaluation_pipelines/rag_evaluations/results"
    # parents=True: the intermediate directories may not exist on a fresh
    # checkout, which would otherwise fail with FileNotFoundError.
    results_dir.mkdir(parents=True, exist_ok=True)

    # Prepare results data
    results_data = {
        'implementations': IMPLEMENTATIONS,
        'datasets': DATASETS,
        'results': {}
    }

    # Format results for each implementation, keyed by its display name.
    for impl_id, impl_results in results.items():
        impl_name = IMPLEMENTATIONS[impl_id]["name"]
        results_data['results'][impl_name] = {
            dataset_id: {
                'metrics': dataset_results['metrics_df'],
                'evaluation_data': {
                    'questions': dataset_results['data']['question'],
                    'answers': dataset_results['data']['answer'],
                    'references': dataset_results['data']['reference']
                }
            }
            for dataset_id, dataset_results in impl_results.items()
        }

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f'rag_comparison_results_{timestamp}.json'
    # Explicit encoding so the output does not depend on the platform default.
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results_data, f, indent=2)

    print(f"Results saved to {results_file}")

# Save all results
# Persists everything collected in all_results by the comparison cell.
save_results(all_results)

## Example Queries

In [None]:
def test_example_queries(implementation: Any):
    """Send a fixed set of sample questions to ``implementation`` and print the replies.

    Args:
        implementation: Object exposing ``query(text)`` whose result is
            indexed with ``'response'``.
    """
    sample_questions = (
        "What is the main focus of the article?",
        "What evidence supports the main argument?",
        "What are the key implications discussed?",
    )

    print("Testing example queries...\n")
    for question in sample_questions:
        # Echo the question first so it is visible even if the query fails.
        print(f"Query: {question}")
        reply = implementation.query(question)['response']
        print(f"Response: {reply}\n")

# Test each implementation
# Runs the sample questions against every loaded implementation.
# NOTE(review): this cell runs after the comparison cell has called
# manager.cleanup(); with cleanup_resources enabled the OpenSearch domain may
# no longer exist, so these queries may fail - confirm the intended order.
for impl_id, implementation in implementations.items():
    impl_name = IMPLEMENTATIONS[impl_id]["name"]
    print(f"\nTesting {impl_name}...")
    test_example_queries(implementation)

## Analysis and Insights

Use this section to analyze the results and document insights about the comparison:

1. Performance Analysis
   - Compare metrics across implementations
   - Identify strengths and weaknesses
   - Note any patterns or trends

2. Resource Usage
   - Compare computational requirements
   - Analyze response times
   - Consider scaling implications

3. Quality Assessment
   - Evaluate answer quality
   - Compare context relevance
   - Assess faithfulness to sources

4. Recommendations
   - Suggest optimal use cases
   - Identify areas for improvement
   - Consider trade-offs