# GraphRAG Benchmarking

This notebook benchmarks our GraphRAG implementation on the Origin of COVID-19 dataset so that its results can be compared with baseline RAG.

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Graph Construction
- min_entity_freq: Minimum frequency for entity inclusion (default: 2)
- max_relation_distance: Maximum token distance for relationships (default: 10)
- confidence_threshold: Minimum confidence for relations (default: 0.5)

### Hybrid Search
- k_graph: Number of graph-based results (default: 5)
- k_vector: Number of vector-based results (default: 3)
- alpha: Weight for combining scores (default: 0.7)

### Neptune Settings
- instance_type: Neptune instance type (default: 'db.r6g.xlarge')
- enable_audit: Enable audit logging (default: True)

### API Settings
- max_retries: Maximum number of retry attempts (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Prerequisites
- Run `setup.ipynb` first to configure the environment
- The Neptune cluster must be configured
- The spaCy model must be downloaded

In [None]:
import os
import json
import sys
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from datasets import Dataset

# Resolve the directory two levels above the working directory and register
# it on sys.path (once) so the project-local `utils` package can be imported.
project_root = Path("../..").resolve()
_project_root_entry = str(project_root)
if _project_root_entry not in sys.path:
    sys.path.append(_project_root_entry)

# Import utilities
from utils.metrics.rag_metrics import RAGMetricsEvaluator
from utils.notebook_utils.dataset_utils import (
    load_labeled_dataset,
    examine_dataset_structure,
    save_dataset_info
)
from utils.notebook_utils.importable import notebook_to_module

In [None]:
# Dataset Configuration
DATASET_NAME = "OriginOfCovid19Dataset"
# Directory holding the labeled dataset; its "source_files" subfolder is what gets ingested
DATASET_DIR = project_root / "datasets/rag_evaluation/labeled/covid19_origin"
# None (or any falsy value) evaluates every example; an integer N keeps dataset.examples[:N]
NUM_EVAL_SAMPLES = None  # Set to a number for partial evaluation

# Neptune Configuration
# NOTE(review): NEPTUNE_INSTANCE is not referenced by any other cell visible in
# this notebook -- confirm it is still needed.
NEPTUNE_INSTANCE = "graph-rag-benchmark-store"
# When True, the final cell calls the cleanup() methods on rag.neptune_manager and rag.opensearch
CLEANUP_RESOURCES = True  # Default to cleaning up to avoid unexpected costs

# RAG Configuration
# The settings are declared per concern and merged (in this order) into the
# single flat RAG_CONFIG mapping that the rest of the notebook reads.
_DOCUMENT_PROCESSING = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "enable_chunking": True,
}
_GRAPH_CONSTRUCTION = {
    "min_entity_freq": 2,
    "max_relation_distance": 10,
    "confidence_threshold": 0.5,
}
_HYBRID_SEARCH = {
    "k_graph": 5,
    "k_vector": 3,
    "alpha": 0.7,
}
_NEPTUNE_SETTINGS = {
    "instance_type": "db.r6g.xlarge",
    "enable_audit": True,
}
_API_SETTINGS = {
    "max_retries": 5,
    "min_delay": 1.0,
    "max_delay": 60.0,
}

RAG_CONFIG = {
    **_DOCUMENT_PROCESSING,
    **_GRAPH_CONSTRUCTION,
    **_HYBRID_SEARCH,
    **_NEPTUNE_SETTINGS,
    **_API_SETTINGS,
}

# Cost reminder shown every time the configuration cell is executed.
for _notice_line in (
    "Note: This notebook uses Amazon Neptune which incurs costs.",
    "CLEANUP_RESOURCES is enabled by default to delete resources after benchmarking.",
    "Set CLEANUP_RESOURCES = False if you want to preserve the Neptune instance.",
):
    print(_notice_line)

In [None]:
# Load the GraphRAG class and the ingestion entry point from their notebooks
implementation_path = str(project_root / 'rag_implementations/graph_rag/implementation.ipynb')
ingestion_path = str(project_root / 'rag_implementations/graph_rag/ingestion.ipynb')

_implementation_module = notebook_to_module(implementation_path)
GraphRAG = _implementation_module.GraphRAG

_ingestion_module = notebook_to_module(ingestion_path)
ingest_documents = _ingestion_module.ingest_documents

# Metrics evaluator used later by run_evaluation()
evaluator = RAGMetricsEvaluator()

In [None]:
# Load the labeled dataset together with its source documents
print(f"Loading {DATASET_NAME}...")
dataset, documents = load_labeled_dataset(DATASET_DIR, download_if_missing=True)
print(f"Loaded {len(dataset.examples)} examples and {len(documents)} documents")

# Keep only dataset.examples[:NUM_EVAL_SAMPLES] when a (truthy) limit is configured
if NUM_EVAL_SAMPLES:
    eval_examples = dataset.examples[:NUM_EVAL_SAMPLES]
else:
    eval_examples = dataset.examples
print(f"Using {len(eval_examples)} examples for evaluation")

# Summarise the dataset structure and show it as indented JSON
dataset_info = examine_dataset_structure(dataset, documents)
print("\nDataset Structure:")
print(json.dumps(dataset_info, indent=2))

# Persist the summary next to the dataset files
dataset_info_path = DATASET_DIR / 'dataset_info.json'
save_dataset_info(dataset_info, dataset_info_path)
print(f"\nDataset information saved to: {dataset_info_path}")

In [None]:
# RAG_CONFIG entries that are forwarded, under the same names, to GraphRAG(...)
_GRAPH_RAG_OPTION_NAMES = (
    "chunk_size",
    "chunk_overlap",
    "enable_chunking",
    "min_entity_freq",
    "max_relation_distance",
    "confidence_threshold",
    "k_graph",
    "k_vector",
    "alpha",
    "instance_type",
    "enable_audit",
    "max_retries",
    "min_delay",
    "max_delay",
)

# Build the GraphRAG instance; the index name is derived from the dataset name
print("Initializing GraphRAG...")
_graph_rag_options = {name: RAG_CONFIG[name] for name in _GRAPH_RAG_OPTION_NAMES}
rag = GraphRAG(
    index_name=f"{DATASET_NAME.lower()}-benchmark",
    **_graph_rag_options
)

# Ingest the dataset's source files, tagging them with the dataset name
print("\nIngesting documents...")
source_dir = DATASET_DIR / "source_files"
ingest_documents(
    str(source_dir),
    rag,
    metadata={'dataset': DATASET_NAME},
    batch_size=100
)

In [None]:
# Run evaluation
def _preview_item(item, limit=100):
    """Return a short preview of *item* for diagnostic output.

    Sliceable values (strings, lists) are truncated to *limit* elements.
    Values that cannot be sliced (e.g. ``None`` or a dict) fall back to a
    truncated ``repr`` so that building the preview never raises.
    """
    try:
        return item[:limit]
    except (TypeError, KeyError):
        return repr(item)[:limit]


def run_evaluation():
    """Query the RAG system for every evaluation example and score the answers.

    Reads the notebook globals ``eval_examples``, ``rag`` and ``evaluator``.
    For each example, ``rag.query(example.query)`` is called and the context
    contents, response, reference answer and graph context are collected.
    The collected columns are then scored with ``evaluator.evaluate_labeled``
    and ``calculate_graph_metrics``.

    Returns:
        dict with three keys:
            ``'raw_results'``: the object returned by ``evaluate_labeled``.
            ``'metrics_df'``: the combined metrics frame as ``DataFrame.to_dict()``.
            ``'data'``: the collected columns (``question``, ``answer``,
            ``contexts``, ``reference``, ``graph_contexts``).

    Raises:
        Exception: any error raised while querying or scoring is re-raised
            after the progress bar status or the diagnostics were printed.
    """
    print("Running evaluation...")
    total = len(eval_examples)

    questions = []
    contexts = []
    answers = []
    references = []
    graph_contexts = []

    # Progress bar for overall evaluation
    with tqdm_notebook(total=total, desc="Evaluating") as pbar:
        for position, example in enumerate(eval_examples, start=1):
            try:
                result = rag.query(example.query)

                questions.append(example.query)
                contexts.append([doc['content'] for doc in result['context']])
                answers.append(result['response'])
                references.append(example.reference_answer)
                graph_contexts.append(result['graph_context'])

                pbar.update(1)
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': 'Success'
                })
            except Exception as e:
                # Surface which query failed before aborting the whole run.
                pbar.set_postfix({
                    'Query': f"{position}/{total}",
                    'Status': f'Error: {type(e).__name__}'
                })
                raise

    # Column-oriented view of everything collected above
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "reference": references,
        "graph_contexts": graph_contexts
    }

    # Evaluate results
    print("\nCalculating metrics...")
    try:
        # Standard RAG metrics
        results = evaluator.evaluate_labeled(
            queries=questions,
            contexts=contexts,
            generated_answers=answers,
            reference_answers=references,
            plot_results=True
        )

        # Graph-specific metrics
        graph_metrics = calculate_graph_metrics(graph_contexts)

        # Combine metrics.
        # NOTE(review): graph_metrics is a single row and concat(axis=1) aligns
        # on the row index, so if the evaluator frame has more than one row the
        # graph columns are NaN on the other rows -- confirm this is intended.
        df = results.to_pandas()
        df = pd.concat([df, pd.DataFrame([graph_metrics])], axis=1)

        # Return both raw results and the metrics frame (as a plain dict)
        return {
            'raw_results': results,
            'metrics_df': df.to_dict(),
            'data': data
        }

    except Exception as e:
        print(f"Error during evaluation: {type(e).__name__}")
        print(f"Error details: {str(e)}")
        print("\nDataset contents:")
        for key, value in data.items():
            print(f"\n{key}:")
            print(f"Type: {type(value)}")
            print(f"Length: {len(value)}")
            # Guard the preview: an empty column or a non-sliceable first item
            # must not raise here and hide the original error.
            if value:
                print(f"First item: {_preview_item(value[0])}...")
            else:
                print("First item: <none collected>")
        raise

def calculate_graph_metrics(graph_contexts):
    """Summarise entity and relation statistics over the graph contexts.

    Args:
        graph_contexts: A sized collection with one entry per query. Each
            entry is an iterable of per-document dicts providing an
            ``'entities'`` and a ``'relations'`` collection whose items
            carry a ``'label'`` key.

    Returns:
        dict with the average number of entities and of relations per entry
        of ``graph_contexts`` (``0`` when it is empty), plus the number of
        distinct entity labels and distinct relation labels encountered.
    """
    entity_count = 0
    relation_count = 0
    entity_labels = set()
    relation_labels = set()

    for query_ctx in graph_contexts:
        for doc in query_ctx:
            # Entities: running total plus the labels seen so far
            entities = doc['entities']
            entity_count += len(entities)
            entity_labels.update(item['label'] for item in entities)

            # Relations: same bookkeeping
            relations = doc['relations']
            relation_count += len(relations)
            relation_labels.update(item['label'] for item in relations)

    # Averages are taken per query context, not per document
    context_total = len(graph_contexts)
    if context_total > 0:
        mean_entities = entity_count / context_total
        mean_relations = relation_count / context_total
    else:
        mean_entities = 0
        mean_relations = 0

    return {
        'avg_entities_per_context': mean_entities,
        'avg_relations_per_context': mean_relations,
        'unique_entity_types': len(entity_labels),
        'unique_relation_types': len(relation_labels)
    }

# Run the full evaluation; `results` (keys: 'raw_results', 'metrics_df', 'data')
# is read by the result-saving and plotting cells.
results = run_evaluation()

In [None]:
# Save results
results_dir = project_root / "evaluation_pipelines/rag_evaluations/results"
# parents=True also creates missing intermediate directories, so the cell
# works when run from a checkout where they do not exist yet.
results_dir.mkdir(parents=True, exist_ok=True)

# Everything needed to reproduce and compare this run: dataset sizes, the
# configuration used, the metrics and the raw question/answer/reference triples.
results_data = {
    'dataset': DATASET_NAME,
    'num_examples': len(dataset.examples),
    'num_documents': len(documents),
    'num_evaluated': len(eval_examples),
    'rag_config': RAG_CONFIG,
    'metrics': results['metrics_df'],
    'evaluation_data': {
        'questions': results['data']['question'],
        'answers': results['data']['answer'],
        'references': results['data']['reference']
    }
}

# NOTE(review): json.dump writes NaN values as the non-standard token `NaN`;
# if the metrics contain NaN, strict JSON parsers will reject this file.
results_file = results_dir / f'graph_rag_results_{DATASET_NAME.lower()}.json'
with open(results_file, 'w', encoding='utf-8') as f:
    json.dump(results_data, f, indent=2)

print(f"Results saved to {results_file}")

In [None]:
# Plot graph metrics
def plot_graph_metrics(metrics_df):
    """Draw two side-by-side bar charts of the graph-specific metrics.

    Args:
        metrics_df: DataFrame whose first row provides the columns
            ``avg_entities_per_context``, ``avg_relations_per_context``,
            ``unique_entity_types`` and ``unique_relation_types``.
    """
    # (subplot position, title, ((bar label, source column), ...))
    panels = (
        (
            1,
            'Average Entities and Relations per Context',
            (
                ('Entities', 'avg_entities_per_context'),
                ('Relations', 'avg_relations_per_context'),
            ),
        ),
        (
            2,
            'Unique Entity and Relation Types',
            (
                ('Entity Types', 'unique_entity_types'),
                ('Relation Types', 'unique_relation_types'),
            ),
        ),
    )

    plt.figure(figsize=(12, 6))

    for position, title, bars in panels:
        plt.subplot(1, 2, position)
        # Only the first row of each column is plotted
        heights = [metrics_df[column].iloc[0] for _, column in bars]
        bar_labels = [label for label, _ in bars]
        plt.bar(bar_labels, heights)
        plt.title(title)
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()

# Plot metrics: rebuild a DataFrame from the dict stored under
# results['metrics_df'] and draw the graph-specific charts from it.
metrics_df = pd.DataFrame(results['metrics_df'])
plot_graph_metrics(metrics_df)

In [None]:
# Example queries, run one by one against the ingested graph
example_queries = [
    "What is the main focus of the article 'The Origin of COVID-19 and Why It Matters'?",
    "What evidence suggests that SARS-CoV-2 emerged naturally rather than being engineered?",
    "What are some potential consequences of not understanding how COVID-19 emerged?"
]

print("Testing example queries...\n")
for query in example_queries:
    print(f"Query: {query}")
    result = rag.query(query)

    # Per-document graph context: the entities and relations returned with the answer
    print("\nGraph Context:")
    for ctx in result['graph_context']:
        print(f"\nDocument {ctx['doc_id']}:")

        entity_summary = ", ".join(
            f"{entity['text']} ({entity['label']})" for entity in ctx['entities']
        )
        print("Entities:", entity_summary)

        relation_summary = ", ".join(
            f"{relation['from']} {relation['label']} {relation['to']}"
            for relation in ctx['relations']
        )
        print("Relations:", relation_summary)

    print(f"\nResponse: {result['response']}\n")
    print("-" * 80)

In [None]:
# Resource Cleanup
if CLEANUP_RESOURCES:
    print("Cleaning up resources...")
    # Run the second cleanup even if the first one raises, so a failure in one
    # cleanup does not leave the other resource running; the error still
    # propagates afterwards.
    try:
        rag.neptune_manager.cleanup()
    finally:
        rag.opensearch.cleanup()