# RAG Implementation Comparison Template

This notebook provides a template for comparing different RAG implementations. It can be used to evaluate and compare any RAG techniques, including but not limited to:
- Baseline RAG vs GraphRAG
- Different vector stores
- Different chunking strategies
- Different embedding models
- Different retrieval methods

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Dict, List, Any

# Import utilities
import utils_setup
from utils import RAGMetricsEvaluator, BenchmarkVisualizer, notebook_to_module

## Configuration

Define the implementations to compare, the datasets and metrics to evaluate them on, and the evaluation parameters (batch size and sleep time).

In [None]:
# Evaluation configuration: edit this dictionary to choose what is compared.
config = {
    # Implementation id -> display name (used in load-error messages) and the
    # notebook that is loaded as a module to obtain `RAGImplementation`.
    "implementations": {
        "implementation_1": {
            "name": "baseline_rag",
            "notebook_path": "../../rag_implementations/baseline_rag/implementation.ipynb",
        },
        "implementation_2": {
            "name": "graph_rag",
            "notebook_path": "../../rag_implementations/graph_rag/implementation.ipynb",
        },
    },
    # Dataset names, grouped by whether they are evaluated with reference
    # answers ("labeled") or without them ("unlabeled").
    "datasets": {
        "labeled": ["dataset1", "dataset2"],
        "unlabeled": ["dataset3"],
    },
    # Metric names per dataset kind.
    # NOTE(review): no cell in this notebook reads "metrics" directly --
    # confirm where (or whether) the evaluator consumes it.
    "metrics": {
        "labeled": [
            "faithfulness",
            "context_precision",
            "response_relevancy",
            "context_recall",
            "context_entities_recall",
        ],
        "unlabeled": [
            "faithfulness",
            "context_precision",
            "response_relevancy",
            "noise_sensitivity",
        ],
    },
    # Both values are forwarded to RAGMetricsEvaluator when it is constructed.
    "batch_size": 20,
    "sleep_time": 1,
}

## Load Implementations

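Load each configured implementation notebook as a module and instantiate its `RAGImplementation` class. Implementations that fail to load are reported and skipped.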

In [None]:
# Build the {implementation id -> RAG instance} mapping from the config.
# A notebook that fails to load or instantiate is reported and skipped, so
# the remaining implementations can still be compared.
implementations = {}
for impl_id, impl_config in config["implementations"].items():
    try:
        module = notebook_to_module(impl_config["notebook_path"])
        rag_instance = module.RAGImplementation()
    except Exception as e:
        print(f"Error loading {impl_config['name']}: {str(e)}")
    else:
        # Register only implementations that were created successfully.
        implementations[impl_id] = rag_instance

## Evaluation Functions

In [None]:
async def evaluate_on_dataset(dataset_name: str, implementations: Dict, is_labeled: bool = True):
    """Run every implementation over one dataset and score its answers.

    Args:
        dataset_name: Name handed to the dataset loader.
        implementations: Mapping of implementation id to an object exposing
            ``query(question)``.
        is_labeled: When true, the dataset is loaded together with reference
            answers and scored via ``evaluate_labeled``; otherwise it is
            loaded without them and scored via ``evaluate_unlabeled``.

    Returns:
        A dict mapping each implementation id to the value the evaluator
        returned for that implementation.
    """
    # Evaluator settings come from the notebook-level ``config``.
    scorer = RAGMetricsEvaluator(
        batch_size=config["batch_size"],
        sleep_time=config["sleep_time"],
    )

    # NOTE(review): load_llama_dataset / load_unlabeled_dataset are not
    # imported in the cells visible here -- confirm where they are defined.
    if is_labeled:
        queries, contexts, reference_answers = load_llama_dataset(dataset_name)
    else:
        queries, contexts = load_unlabeled_dataset(dataset_name)

    scores_by_impl = {}
    for impl_key, rag in implementations.items():
        # Answer each query in order, one synchronous call at a time.
        answers = []
        for question in queries:
            answers.append(rag.query(question))

        # Inputs common to both evaluation modes.
        shared_inputs = {
            "queries": queries,
            "contexts": contexts,
            "generated_answers": answers,
        }

        if not is_labeled:
            scores_by_impl[impl_key] = await scorer.evaluate_unlabeled(**shared_inputs)
            continue

        scores_by_impl[impl_key] = await scorer.evaluate_labeled(
            **shared_inputs,
            reference_answers=reference_answers,
        )

    return scores_by_impl

def visualize_results(results: Dict[str, Any], output_dir: str):
    """Write a comparison report for ``results`` into ``output_dir``.

    Args:
        results: Evaluation results handed unchanged to
            ``BenchmarkVisualizer.create_comparison_report``.
        output_dir: Directory that receives the report; it is created
            (including missing parents) when it does not exist yet.
    """
    report_builder = BenchmarkVisualizer()

    # The report is written into output_dir, so make sure it exists first.
    os.makedirs(output_dir, exist_ok=True)

    report_builder.create_comparison_report(results, output_dir)

## Run Evaluation

In [None]:
# Evaluate on labeled datasets
labeled_results = {}
for dataset in config["datasets"]["labeled"]:
    labeled_results[dataset] = await evaluate_on_dataset(
        dataset,
        implementations,
        is_labeled=True
    )

# Evaluate on unlabeled datasets
unlabeled_results = {}
for dataset in config["datasets"]["unlabeled"]:
    unlabeled_results[dataset] = await evaluate_on_dataset(
        dataset,
        implementations,
        is_labeled=False
    )

## Visualize Results

In [None]:
# All reports are written below ./results (relative to the working directory).
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

# One report per dataset kind, each in its own subdirectory.
visualize_results(labeled_results, output_dir=str(results_dir / "labeled"))
visualize_results(unlabeled_results, output_dir=str(results_dir / "unlabeled"))

## Analysis and Insights

Use this section to analyze the results and document insights about the comparison:

1. Performance Analysis
   - Compare metrics across implementations
   - Identify strengths and weaknesses
   - Note any patterns or trends

2. Resource Usage
   - Compare computational requirements
   - Analyze response times
   - Consider scaling implications

3. Quality Assessment
   - Evaluate answer quality
   - Compare context relevance
   - Assess faithfulness to sources

4. Recommendations
   - Suggest optimal use cases
   - Identify areas for improvement
   - Consider trade-offs