# GraphRAG Benchmarking

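This notebook benchmarks our GraphRAG implementation on the Origin of COVID-19 dataset. It runs against Amazon Neptune and OpenSearch, which incur costs; with the default configuration those resources are deleted once benchmarking finishes.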

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Graph Construction
- min_entity_freq: Minimum frequency for entity inclusion (default: 2)
- max_relation_distance: Maximum token distance for relationships (default: 10)

### Hybrid Search
- k_graph: Number of graph-based results (default: 5)
- k_vector: Number of vector-based results (default: 3)
- alpha: Weight for combining scores (default: 0.7)
- search_type: Type of vector search ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score (default: None)

### API Settings
- max_retries: Maximum retry attempts (default: 5)
- min_delay: Minimum retry delay in seconds (default: 1)
- max_delay: Maximum retry delay in seconds (default: 60)

In [None]:
# Suppress CUDA warnings
import warnings
warnings.filterwarnings('ignore', message="Can't initialize NVML")

import os
import json
import sys
import boto3
import pandas as pd
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from datasets import Dataset

# Put the repository root on sys.path so the project packages imported below resolve.
# "../.." is resolved against the current working directory, so this assumes the
# kernel was started from the notebook's own directory.
project_root = Path("..", "..").resolve()
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

# Import utilities
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info
)
from utils.notebook_utils.importable import notebook_to_module

# Import graph RAG components
from rag_implementations.graph_rag.components import (
    calculate_graph_metrics,
    calculate_graph_coverage,
    calculate_graph_relevance,
    plot_graph_metrics
)

In [None]:
# Dataset Configuration
# Name used in log messages, in the benchmark index name, and in the results file name.
DATASET_NAME = "OriginOfCovid19Dataset"
# Directory handed to load_labeled_dataset(); its "source_files" subdirectory is what gets ingested.
DATASET_DIR = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
# None (or any falsy value, including 0) evaluates every example; a positive integer N
# keeps only the first N examples.
NUM_EVAL_SAMPLES = None  # Set to a number for partial evaluation

# Neptune Configuration
# Passed to GraphRAG as `graph_store_config`; `cleanup_enabled` is also read by the
# cleanup cell at the end of the notebook to decide whether resources are deleted.
NEPTUNE_CONFIG = dict(
    cluster_name="test-graph-rag-benchmark",
    cleanup_enabled=True,   # delete resources afterwards (cost control)
    enable_audit=True,      # detailed logging
    max_retries=10,         # more retries for the initial connection
    retry_delay=5.0,        # start with a longer delay
)

# RAG Configuration
# Every entry is forwarded to the GraphRAG constructor as a keyword argument.
RAG_CONFIG = dict(
    # --- Document processing ---
    chunk_size=500,            # words per chunk (500 words ≈ 2000 chars)
    chunk_overlap=50,          # words shared between neighbouring chunks
    enable_chunking=True,      # split documents into chunks

    # --- Graph construction ---
    min_entity_freq=2,         # minimum frequency for an entity to be included
    max_relation_distance=10,  # maximum token distance for a relationship

    # --- Hybrid search ---
    k_graph=5,                 # number of graph-based results
    k_vector=3,                # number of vector-based results
    alpha=0.7,                 # weight used when combining scores
    search_type="script",      # 'script' or 'knn'
    similarity_threshold=None,  # minimum similarity score; unset here

    # --- OpenSearch config ---
    # NOTE(review): None presumably defers to GraphRAG's own defaults - confirm.
    index_settings=None,
    knn_params=None,

    # --- API settings ---
    max_retries=5,             # maximum retry attempts
    min_delay=1.0,             # minimum retry delay in seconds
    max_delay=60.0,            # maximum retry delay in seconds
)

# Cost warning shown to whoever runs the notebook; one call, one message per line.
print(
    "Note: This notebook uses Amazon Neptune and OpenSearch which incur costs.",
    "Neptune and OpenSearch resources will be cleaned up after benchmarking.",
    "Set cleanup_enabled=False in NEPTUNE_CONFIG if you want to preserve the resources.",
    sep="\n",
)

In [None]:
# The GraphRAG class and the ingestion entry point are defined in notebooks, so each
# notebook is loaded through notebook_to_module() and the needed attribute is taken
# from the returned object.
implementation_path = str(project_root / 'rag_implementations/graph_rag/implementation.ipynb')
_implementation_module = notebook_to_module(implementation_path)
GraphRAG = _implementation_module.GraphRAG

ingestion_path = str(project_root / 'rag_implementations/graph_rag/ingestion.ipynb')
_ingestion_module = notebook_to_module(ingestion_path)
ingest_documents = _ingestion_module.ingest_documents

# Evaluator used later to compute the standard RAG metrics
evaluator = RAGMetricsEvaluator()

In [None]:
# Fetch the labeled dataset (downloading it when it is not on disk yet)
print(f"Loading {DATASET_NAME}...")
dataset, documents = load_labeled_dataset(DATASET_DIR, download_if_missing=True)
print(f"Loaded {len(dataset.examples)} examples and {len(documents)} documents")

# Pick the evaluation subset: the first NUM_EVAL_SAMPLES examples when that setting is
# truthy, otherwise the full example list.
if NUM_EVAL_SAMPLES:
    eval_examples = dataset.examples[:NUM_EVAL_SAMPLES]
else:
    eval_examples = dataset.examples
print(f"Using {len(eval_examples)} examples for evaluation")

# Summarise the dataset layout and show it as pretty-printed JSON
dataset_info = examine_dataset_structure(dataset, documents)
print("\nDataset Structure:")
print(json.dumps(dataset_info, indent=2))

# Persist the summary next to the dataset files
save_dataset_info(dataset_info, DATASET_DIR / 'dataset_info.json')
print(f"\nDataset information saved to: {DATASET_DIR / 'dataset_info.json'}")

In [None]:
# Build the GraphRAG instance, ingest the source documents if the index is empty, and
# report whether the graph store came up. Any failure is reported, followed by a
# non-destructive cleanup, and then re-raised.
print("Initializing GraphRAG...")
rag = None  # stays None if the constructor itself raises, so the handler can tell
try:
    benchmark_index = f"{DATASET_NAME.lower()}-benchmark"
    rag = GraphRAG(
        index_name=benchmark_index,
        graph_store_config=NEPTUNE_CONFIG,
        **RAG_CONFIG
    )

    # Only ingest into an empty index; re-ingesting would duplicate documents
    print("\nChecking existing documents...")
    count_response = rag.vector_store.opensearch.count(index=rag.index_name)
    doc_count = count_response['count']
    if doc_count <= 0:
        print("Ingesting documents...")
        source_dir = DATASET_DIR / "source_files"
        ingest_documents(
            str(source_dir),
            rag,
            metadata={'dataset': DATASET_NAME},
            batch_size=100
        )
    else:
        print(f"Found {doc_count} documents already indexed")
        print("Skipping ingestion to avoid duplicates")

    # Report the graph store status
    print("\nGraph Store Status:")
    neptune_ready = rag.graph_store and rag.graph_store._initialized
    if neptune_ready:
        print("✅ Neptune connection successful")
        print(f"Endpoint: {rag.graph_store.graph.endpoint}")
    else:
        print("❌ Neptune connection not initialized")

except Exception as e:
    print(f"\nError during initialization: {str(e)}")
    if rag:
        # Release what was set up, but keep the provisioned resources in place
        rag.cleanup(delete_resources=False)
    raise

In [None]:
# Run evaluation: query GraphRAG once per example, then compute the standard RAG
# metrics and the graph-specific metrics over the queries that succeeded.
print("Running evaluation...")
total = len(eval_examples)

# Parallel accumulators: position k in each of the first five lists belongs to the
# same successfully answered query. They are handed side by side to the evaluator
# and to Dataset.from_dict below, so they must always have equal length.
questions = []
contexts = []
answers = []
references = []
graph_contexts = []
# Only filled for results that report a 'graph_query_time'; not parallel to the above.
graph_query_times = []
# 1-based positions of the examples whose query (or result unpacking) raised.
failed_queries = []

# Progress bar for overall evaluation
with tqdm_notebook(total=total, desc="Evaluating") as pbar:
    for i, example in enumerate(eval_examples):
        try:
            result = rag.query(example.query)

            # Extract every field BEFORE touching the accumulators. If a key or
            # attribute is missing, nothing has been appended yet, so the lists
            # cannot drift out of alignment.
            question = example.query
            context_texts = [doc['content'] for doc in result['context']]
            response = result['response']
            reference = example.reference_answer
            graph_context = result['graph_context']
        except Exception as e:
            # Best effort: record the failure and carry on with the next example.
            failed_queries.append(i + 1)
            pbar.set_postfix({
                'Query': f"{i+1}/{total}",
                'Status': f'Error: {type(e).__name__}'
            })
            print(f"\nError processing query {i+1}: {str(e)}")
        else:
            questions.append(question)
            contexts.append(context_texts)
            answers.append(response)
            references.append(reference)
            graph_contexts.append(graph_context)

            # Track graph query performance
            if 'graph_query_time' in result:
                graph_query_times.append(result['graph_query_time'])

            pbar.set_postfix({
                'Query': f"{i+1}/{total}",
                'Status': 'Success'
            })
        finally:
            # Advance for failures too, so the bar reaches `total` when the loop ends.
            pbar.update(1)

if failed_queries:
    print(f"\n{len(failed_queries)} of {total} queries failed and were excluded from the metrics")

# Without a single successful query there is nothing to score; stop with a clear
# message instead of passing empty lists to the metric code.
if not questions:
    raise RuntimeError("No queries were evaluated successfully; cannot compute metrics")

# Create evaluation dataset
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "reference": references,
    "graph_contexts": graph_contexts
}
eval_dataset = Dataset.from_dict(data)

# Calculate standard RAG metrics
print("\nCalculating standard RAG metrics...")
rag_results = evaluator.evaluate_labeled(
    queries=questions,
    contexts=contexts,
    generated_answers=answers,
    reference_answers=references,
    plot_results=True
)

# Calculate graph-specific metrics
print("\nCalculating graph metrics...")
graph_metrics = calculate_graph_metrics(graph_contexts)

# Add performance metrics (only when at least one result reported a timing)
if graph_query_times:
    graph_metrics['performance'] = {
        'avg_query_time': sum(graph_query_times) / len(graph_query_times),
        'min_query_time': min(graph_query_times),
        'max_query_time': max(graph_query_times)
    }

# Plot graph metrics
plot_graph_metrics(graph_metrics)

# Convert results to pandas DataFrame
df = rag_results.to_pandas()

# Add graph metrics as extra columns; nested dict entries (such as 'performance')
# are skipped here and remain available in `graph_metrics`.
for metric, value in graph_metrics.items():
    if not isinstance(value, dict):
        df[f"graph_{metric}"] = value

In [None]:
# Save results: write the configuration, metrics and raw question/answer data of this
# run to a JSON file under the project's results directory.
results_dir = project_root / "evaluation_pipelines/rag_evaluations/results"
# parents=True also creates missing intermediate directories; without it mkdir raises
# FileNotFoundError when the parent folder does not exist yet.
results_dir.mkdir(parents=True, exist_ok=True)

results_data = {
    'dataset': DATASET_NAME,
    'num_examples': len(dataset.examples),
    'num_documents': len(documents),
    # Number of examples selected for evaluation (attempted), not the number that
    # produced an answer; len(questions) gives the latter.
    'num_evaluated': len(eval_examples),
    'rag_config': RAG_CONFIG,
    'neptune_config': NEPTUNE_CONFIG,
    # NOTE(review): json.dump requires plain JSON types here - confirm that
    # df.to_dict() and graph_metrics never carry non-serializable values.
    'metrics': df.to_dict(),
    'graph_metrics': graph_metrics,
    'evaluation_data': {
        'questions': questions,
        'answers': answers,
        'references': references
    }
}

results_file = results_dir / f'graph_rag_results_{DATASET_NAME.lower()}.json'
# Explicit encoding so the file does not depend on the platform's locale default.
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {results_file}")

In [None]:
# Smoke test: send a few hand-written questions through GraphRAG and print, for each
# one, the graph context that was retrieved followed by the generated response.
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?"
]

print("Testing example queries...\n")
for query in example_queries:
    print(f"Query: {query}")
    result = rag.query(query)

    print("\nGraph Context:")
    for ctx in result['graph_context']:
        print(f"\nDocument {ctx['doc_id']}:")

        # One "text (label)" item per entity
        entity_summary = ", ".join(f"{e['text']} ({e['label']})" for e in ctx['entities'])
        print("Entities:", entity_summary)

        # One "from label to" item per relation
        relation_summary = ", ".join(f"{r['from']} {r['label']} {r['to']}" for r in ctx['relations'])
        print("Relations:", relation_summary)

    print(f"\nResponse: {result['response']}\n")
    print("-" * 80)

In [None]:
# Resource Cleanup
# Runs only when NEPTUNE_CONFIG['cleanup_enabled'] is truthy. The teardown is best
# effort: any exception is reported below and not re-raised, so a failure here leaves
# resources that have to be removed by hand.
if NEPTUNE_CONFIG['cleanup_enabled']:
    print("=== Cleaning Up Resources ===")
    print("Warning: This will delete all resources and indexed data")
    print("This may take 15-20 minutes to complete")
    try:
        # `rag` is None when the initialization cell failed before constructing it;
        # in that case nothing is cleaned up and no success message is printed.
        if rag:
            # Delete resources and wait for deletion to complete
            rag.cleanup(delete_resources=True)

            # Wait for OpenSearch domain deletion
            # NOTE(review): relies on a private method of the OpenSearch manager.
            if rag.vector_store and rag.vector_store.opensearch_manager:
                rag.vector_store.opensearch_manager._wait_for_deletion()

            # Wait for Neptune deletion
            # NOTE(review): presumably blocks until the cluster is gone - confirm, and
            # confirm it is safe to call after rag.cleanup(delete_resources=True) above.
            if rag.graph_store and rag.graph_store.neptune_manager:
                rag.graph_store.neptune_manager.cleanup()

            print("✅ Cleanup successful")
    except Exception as e:
        print(f"❌ Error during cleanup: {str(e)}")
        print("Some resources may need to be cleaned up manually")