# RAG Implementation Tuning Template

This notebook provides a template for tuning a RAG implementation's parameters and configurations. It can be used to optimize:
- Chunking parameters
- Retrieval settings
- Model parameters
- Embedding configurations
- Any other implementation-specific parameters

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Vector Search
- k: Number of context documents to retrieve (default: 3)
- search_type: Type of vector search to use ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score to include (default: None)

### OpenSearch
- index_settings: Custom index settings for performance tuning
- knn_params: Parameters for k-NN algorithm (e.g., ef_search)

### API Settings
- max_retries: Maximum number of retry attempts (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Prerequisites
- Run setup.ipynb first to configure the environment

In [None]:
import json
import os
import sys
from datetime import datetime
from itertools import product
from pathlib import Path
from typing import List, Dict, Any, Optional

import boto3
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Make the repository root (two directories above the working directory)
# importable so the `utils` package can be found.
project_root = Path("../..").resolve()
if sys.path.count(str(project_root)) == 0:
    sys.path.append(str(project_root))

# Import utilities
from utils.aws.opensearch_utils import OpenSearchManager
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info
)
from utils.notebook_utils.importable import notebook_to_module

## Configuration

In [None]:
# Tuning target: the implementation notebook to load and the grid of
# constructor parameters to sweep. Every key of "parameter_grid" maps to the
# list of candidate values tried for that parameter.
IMPLEMENTATION = {
    "name": "baseline_rag",
    "notebook_path": "../../rag_implementations/baseline_rag/implementation.ipynb",
    "parameter_grid": {
        # --- Document processing (sizes are counted in words) ---
        "chunk_size": [256, 512, 1024],
        "chunk_overlap": [0, 50, 100],
        "enable_chunking": [True],

        # --- Vector search ---
        "search_type": ["script", "knn"],
        "similarity_threshold": [None, 0.7, 0.8],

        # --- OpenSearch index settings ---
        # One candidate per ef_search value (higher = more accurate but slower).
        "index_settings": [
            {
                "number_of_shards": 1,
                "number_of_replicas": 0,
                "knn": {"algo_param": {"ef_search": ef_search}},
            }
            for ef_search in (256, 512)
        ],

        # --- k-NN parameters ---
        # (ef_construction, m) pairs: a higher ef_construction gives a more
        # accurate index, a higher m gives more connections per node.
        "knn_params": [
            {"ef_construction": ef_construction, "m": m}
            for ef_construction, m in ((256, 16), (512, 16), (512, 32))
        ],

        # --- API settings ---
        "max_retries": [3, 5],
        "min_delay": [1.0],
        "max_delay": [60.0],
    },
}

# Dataset Configuration
# "tuning" datasets drive the parameter search; "validation" datasets are
# only evaluated with the best parameters found. Each "dir" is joined onto
# the project root when the dataset is loaded.
DATASETS = {
    "tuning": {
        "covid19": {
            "name": "OriginOfCovid19Dataset",
            "dir": "datasets/rag_evaluation/labeled/covid19_origin"
        }
    },
    "validation": {
        "paul_graham": {
            "name": "PaulGrahamEssaysDataset",
            "dir": "datasets/rag_evaluation/labeled/paul_graham_essays"
        }
    }
}

# OpenSearch Configuration
# "domain_prefix" is combined with a trial id to name each domain;
# "cleanup_resources" is forwarded to OpenSearchManager as cleanup_enabled.
OPENSEARCH_CONFIG = {
    "domain_prefix": "rag-tuning",
    "cleanup_resources": True  # Default to cleaning up to avoid unexpected costs
}

# Metric Weights for Optimization
# Per-metric multipliers used when collapsing evaluation metrics into one
# aggregate score; metrics without an entry here do not contribute.
METRIC_WEIGHTS = {
    "faithfulness": 0.3,
    "context_precision": 0.2,
    "response_relevancy": 0.3,
    "context_recall": 0.1,
    "context_entities_recall": 0.1
}

print("Note: This notebook uses Amazon OpenSearch which incurs costs.")
# Name the real setting and report its actual value so the instructions
# printed here can be followed as written.
if OPENSEARCH_CONFIG["cleanup_resources"]:
    print("OPENSEARCH_CONFIG['cleanup_resources'] is enabled by default to delete resources after benchmarking.")
    print("Set OPENSEARCH_CONFIG['cleanup_resources'] = False if you want to preserve the OpenSearch domain.")
else:
    print("OPENSEARCH_CONFIG['cleanup_resources'] is disabled: OpenSearch domains are preserved after benchmarking.")
    print("Set OPENSEARCH_CONFIG['cleanup_resources'] = True to delete resources and avoid unexpected costs.")

## Load Implementation

In [None]:
def load_implementation(params: Dict[str, Any]) -> Any:
    """Build the configured RAG implementation from ``params``.

    The notebook at ``IMPLEMENTATION["notebook_path"]`` is loaded through
    ``notebook_to_module`` and its ``RAGImplementation`` class is called with
    ``params`` expanded as keyword arguments.

    Any exception is reported on stdout and then re-raised unchanged.
    """
    try:
        rag_module = notebook_to_module(IMPLEMENTATION["notebook_path"])
        return rag_module.RAGImplementation(**params)
    except Exception as exc:
        print(f"Error loading implementation: {str(exc)}")
        raise

## OpenSearch Setup

In [None]:
def setup_opensearch(trial_id: str) -> OpenSearchManager:
    """Create the OpenSearch manager for one trial and publish its endpoint.

    The domain is named ``<domain_prefix>-<trial_id>`` using
    ``OPENSEARCH_CONFIG``. The endpoint returned by ``setup_domain()`` is
    stored in the ``OPENSEARCH_HOST`` environment variable.

    Returns the manager so the caller can invoke ``cleanup()`` later.
    """
    prefix = OPENSEARCH_CONFIG['domain_prefix']
    domain = f"{prefix}-{trial_id}"

    print(f"Setting up OpenSearch for trial {trial_id}...")
    opensearch = OpenSearchManager(
        domain_name=domain,
        cleanup_enabled=OPENSEARCH_CONFIG["cleanup_resources"],
        verbose=False,
    )

    # Expose the endpoint through the process environment.
    os.environ['OPENSEARCH_HOST'] = opensearch.setup_domain()
    return opensearch

## Parameter Tuning Functions

In [None]:
def generate_parameter_combinations(parameter_grid: Dict[str, List]) -> List[Dict]:
    """Expand a parameter grid into its full Cartesian product.

    Each returned dict maps every grid key to one of its candidate values.
    Combinations are ordered as ``itertools.product`` yields them (the last
    key varies fastest). An empty grid yields a single empty dict; a key with
    no candidates yields no combinations.
    """
    names = list(parameter_grid)
    combos = []
    for choice in product(*(parameter_grid[name] for name in names)):
        combos.append(dict(zip(names, choice)))
    return combos

def calculate_aggregate_score(results: Dict[str, float], weights: Dict[str, float]) -> float:
    """Combine metric values into a single weighted score.

    Adds ``value * weight`` for every metric in ``results`` that also has an
    entry in ``weights``; metrics without a weight are ignored. The total is
    not divided by the sum of the weights, and is ``0`` when nothing matches.
    """
    total = 0
    for name in results:
        if name not in weights:
            continue
        total += results[name] * weights[name]
    return total

def run_evaluation(implementation: Any, dataset: Dataset, evaluator: RAGMetricsEvaluator):
    """Query the implementation for every example and score the answers.

    Args:
        implementation: Object whose ``query(text)`` returns a mapping with a
            ``'response'`` entry and a ``'context'`` iterable of items that
            each have a ``'content'`` entry.
        dataset: Labeled dataset; the code iterates ``dataset.examples`` and
            reads ``.query`` and ``.reference_answer`` on each example.
            NOTE(review): the annotation names ``datasets.Dataset`` while the
            body expects an ``.examples`` attribute -- confirm the intended type.
        evaluator: Metrics evaluator providing ``evaluate_labeled``.

    Returns:
        A dict with ``'raw_results'`` (the evaluator's result object),
        ``'metrics_df'`` (``raw_results.to_pandas().to_dict()``) and ``'data'``
        (the questions, answers, contexts and references that were scored).

    Raises:
        Any exception raised while querying or evaluating is re-raised after
        the progress bar / diagnostics have been updated.
    """
    print("Running evaluation...")
    total = len(dataset.examples)

    questions = []
    contexts = []
    answers = []
    references = []

    # Progress bar for overall evaluation
    with tqdm_notebook(total=total, desc="Evaluating") as pbar:
        for position, example in enumerate(dataset.examples, start=1):
            try:
                result = implementation.query(example.query)

                questions.append(example.query)
                contexts.append([doc['content'] for doc in result['context']])
                answers.append(result['response'])
                references.append(example.reference_answer)

                pbar.update(1)
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': 'Success'
                })
            except Exception as e:
                # Show which query failed before propagating the error.
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': f'Error: {type(e).__name__}'
                })
                raise

    # Inputs handed to the evaluator; also returned to the caller and dumped
    # for diagnostics if evaluation fails.
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "reference": references
    }

    # Evaluate results
    print("\nCalculating metrics...")
    try:
        results = evaluator.evaluate_labeled(
            queries=questions,
            contexts=contexts,
            generated_answers=answers,
            reference_answers=references,
            plot_results=True
        )

        # Convert results to pandas DataFrame
        df = results.to_pandas()

        # Return both raw results and the per-column dict form of the DataFrame
        return {
            'raw_results': results,
            'metrics_df': df.to_dict(),
            'data': data
        }

    except Exception as e:
        print(f"Error during evaluation: {type(e).__name__}")
        print(f"Error details: {str(e)}")
        print("\nDataset contents:")
        for key, value in data.items():
            print(f"\n{key}:")
            print(f"Type: {type(value)}")
            print(f"Length: {len(value)}")
            # The diagnostics must never raise themselves, otherwise they
            # would hide the evaluation error being reported.
            if value:
                first = value[0]
                preview = first[:100] if isinstance(first, (str, list)) else first
                print(f"First item: {preview}...")
            else:
                print("First item: <no items>")
        raise

## Run Parameter Tuning

In [None]:
# Initialize evaluator
evaluator = RAGMetricsEvaluator()

# Generate parameter combinations
parameter_combinations = generate_parameter_combinations(IMPLEMENTATION["parameter_grid"])
print(f"Generated {len(parameter_combinations)} parameter combinations to evaluate")

# Every trial needs at least one tuning dataset to produce a score.
if not DATASETS["tuning"]:
    raise ValueError("DATASETS['tuning'] is empty; at least one tuning dataset is required")

# Store results for each combination
all_results = []

# Run evaluation for each combination
for trial_id, params in enumerate(parameter_combinations):
    print(f"\nEvaluating combination {trial_id + 1}/{len(parameter_combinations)}")
    print("Parameters:", json.dumps(params, indent=2))

    # Set up OpenSearch
    manager = setup_opensearch(f"trial-{trial_id}")

    # From here on the manager exists, so cleanup must run even when the
    # trial fails -- otherwise the OpenSearch resources are left behind.
    try:
        # Initialize implementation with parameters
        implementation = load_implementation(params)

        # Store results for this combination
        trial_results = {
            'trial_id': trial_id,
            'parameters': params,
            'results': {}
        }
        dataset_scores = []

        # Evaluate on tuning dataset
        for dataset_id, dataset_config in DATASETS["tuning"].items():
            print(f"\nEvaluating on {dataset_config['name']}...")

            # Load dataset
            # NOTE(review): `documents` is loaded but never used in this cell --
            # confirm whether the implementation needs them ingested first.
            dataset_dir = project_root / dataset_config["dir"]
            dataset, documents = load_labeled_dataset(dataset_dir)

            # Run evaluation
            results = run_evaluation(implementation, dataset, evaluator)
            trial_results['results'][dataset_id] = results

            # 'metrics_df' is the dict form of a DataFrame (column -> {row: value}),
            # so rebuild the frame and average only the numeric metric columns.
            metrics = pd.DataFrame(results['metrics_df']).mean(numeric_only=True).to_dict()
            dataset_scores.append(calculate_aggregate_score(metrics, METRIC_WEIGHTS))

        # Average over all tuning datasets so that no dataset's score is
        # silently overwritten by the next one.
        trial_results['aggregate_score'] = sum(dataset_scores) / len(dataset_scores)

        # Store results
        all_results.append(trial_results)
    finally:
        # Cleanup OpenSearch
        manager.cleanup()

## Save Results

In [None]:
def save_results(results: List[Dict[str, Any]]):
    """Write the tuning results to a timestamped JSON file.

    The file is created under
    ``<project_root>/evaluation_pipelines/rag_evaluations/results`` and holds
    the implementation config, dataset config, metric weights and ``results``
    (stored under the ``'trials'`` key).

    Args:
        results: One dict per trial, as collected by the tuning loop.
    """
    results_dir = project_root / "evaluation_pipelines/rag_evaluations/results"
    # parents=True: the intermediate directories may not exist yet.
    results_dir.mkdir(parents=True, exist_ok=True)

    # Prepare results data
    results_data = {
        'implementation': IMPLEMENTATION,
        'datasets': DATASETS,
        'metric_weights': METRIC_WEIGHTS,
        'trials': results
    }

    # Serialize before opening the file so a serialization error cannot leave
    # a truncated file behind. default=str is deliberate: trials can carry
    # objects json cannot encode (e.g. the evaluator's raw result object),
    # which are stored as their string form instead of aborting the save.
    payload = json.dumps(results_data, indent=2, default=str)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = results_dir / f'rag_tuning_results_{timestamp}.json'
    with open(results_file, 'w') as f:
        f.write(payload)

    print(f"Results saved to {results_file}")

# Save all results
save_results(all_results)

## Find Best Parameters

In [None]:
# Without any recorded trial there is nothing to rank; fail with a clear
# message instead of an opaque error from max() on an empty sequence.
if not all_results:
    raise ValueError("No tuning trials recorded; run the parameter tuning cell first")

# Convert results to DataFrame for analysis: one row per trial, with the
# trial's parameters as columns plus its aggregate score.
results_df = pd.DataFrame([
    {**trial['parameters'], 'score': trial['aggregate_score']}
    for trial in all_results
])

# Find best parameters (highest aggregate score; first one wins on ties)
best_trial = max(all_results, key=lambda x: x['aggregate_score'])

print("Best Parameters:")
print(json.dumps(best_trial['parameters'], indent=2))
print(f"\nAggregate Score: {best_trial['aggregate_score']:.4f}")

## Validate Best Parameters

In [None]:
print("Validating best parameters...")

# Set up OpenSearch for validation
manager = setup_opensearch("validation")

# The manager exists from here on, so cleanup must run even if validation
# fails -- otherwise the OpenSearch resources are left behind.
try:
    # Initialize implementation with best parameters
    implementation = load_implementation(best_trial['parameters'])

    # Evaluate on validation datasets
    validation_results = {}
    for dataset_id, dataset_config in DATASETS["validation"].items():
        print(f"\nValidating on {dataset_config['name']}...")

        # Load dataset
        dataset_dir = project_root / dataset_config["dir"]
        dataset, documents = load_labeled_dataset(dataset_dir)

        # Run evaluation
        results = run_evaluation(implementation, dataset, evaluator)
        validation_results[dataset_id] = results

        # 'metrics_df' is the dict form of a DataFrame (column -> {row: value}),
        # so rebuild the frame and average only the numeric metric columns.
        metrics = pd.DataFrame(results['metrics_df']).mean(numeric_only=True).to_dict()
        aggregate_score = calculate_aggregate_score(metrics, METRIC_WEIGHTS)

        print(f"\nValidation Results for {dataset_config['name']}:")
        print("Metrics:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")
        print(f"Aggregate Score: {aggregate_score:.4f}")
finally:
    # Cleanup OpenSearch
    manager.cleanup()

## Analysis and Insights

Use this section to analyze the parameter tuning results:

1. Parameter Impact Analysis
   - How each parameter affects performance
   - Parameter interactions and dependencies
   - Sensitivity to parameter changes

2. Validation Performance
   - Generalization to validation dataset
   - Stability of performance
   - Potential overfitting concerns

3. Trade-offs
   - Performance vs. resource usage
   - Quality vs. speed
   - Complexity vs. maintainability

4. Recommendations
   - Optimal parameter settings
   - Parameter ranges to avoid
   - Future tuning suggestions