# GraphRAG Implementation

This notebook implements a graph-based RAG system that uses Neptune for graph storage, OpenSearch for vector storage, and a hybrid search that combines graph and vector retrieval.

## Features
- Document processing with entity/relation extraction
- Graph storage in Neptune
- Vector storage in OpenSearch
- Hybrid search combining graph and vector similarity
- Context-aware response generation

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Graph Construction
- min_entity_freq: Minimum frequency for entity inclusion (default: 2)
- max_relation_distance: Maximum token distance for relationships (default: 10)

### Hybrid Search
- k_graph: Number of graph-based results (default: 5)
- k_vector: Number of vector-based results (default: 3)
- alpha: Weight for combining scores (default: 0.7)
- search_type: Type of vector search ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score (default: None)

### API Settings
- max_retries: Maximum retry attempts (default: 5)
- min_delay: Minimum retry delay in seconds (default: 1)
- max_delay: Maximum retry delay in seconds (default: 60)

In [None]:
import os
import sys
import json
import time
import uuid
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal

# Make the repository root (two directories above the working directory)
# importable so the `rag_implementations` package imports below resolve.
project_root = Path("../..").resolve()
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

# Import components
from rag_implementations.graph_rag.components.document_processor import DocumentProcessor
from rag_implementations.graph_rag.components.graph_store import GraphStore
from rag_implementations.graph_rag.components.vector_store import VectorStore
from rag_implementations.graph_rag.components.hybrid_search import HybridSearch
from rag_implementations.graph_rag.components.response_generator import ResponseGenerator

In [None]:
class GraphRAG:
    """Graph-based RAG implementation using Neptune and OpenSearch.

    Wires together a document processor, a vector store, a graph store,
    a hybrid search component and a response generator, and exposes
    ``query`` to answer a question and ``cleanup`` to release the stores.
    """

    # Number of results requested when hybrid search fails and ``query``
    # falls back to vector-only search.
    _FALLBACK_VECTOR_K = 5

    def __init__(
        self,
        index_name: str,
        # Document processing config
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        enable_chunking: bool = True,
        # Graph construction config
        min_entity_freq: int = 2,
        max_relation_distance: int = 10,
        # Hybrid search config
        k_graph: int = 5,
        k_vector: int = 3,
        alpha: float = 0.7,
        search_type: Literal['script', 'knn'] = 'script',
        similarity_threshold: Optional[float] = None,
        # OpenSearch config
        index_settings: Optional[Dict] = None,
        knn_params: Optional[Dict] = None,
        # Neptune config
        graph_store_config: Optional[Dict] = None,
        # API config
        max_retries: int = 5,
        min_delay: float = 1.0,
        max_delay: float = 60.0
    ):
        """Initialize GraphRAG with configuration parameters.

        Args:
            index_name: Name passed to the vector store; also used to derive
                the default graph store cluster name.
            chunk_size, chunk_overlap, enable_chunking, min_entity_freq,
                max_relation_distance: Forwarded to ``DocumentProcessor``.
            k_graph, k_vector, alpha: Forwarded to ``HybridSearch``.
            search_type, similarity_threshold, index_settings, knn_params:
                Forwarded to ``VectorStore``.
            graph_store_config: Optional overrides for the graph store; only
                the ``cluster_name`` and ``enable_audit`` keys are used.
            max_retries, min_delay, max_delay: Forwarded to
                ``ResponseGenerator``.

        Raises:
            RuntimeError: If any component fails to initialize. Components
                created so far are cleaned up (without deleting resources)
                before the error is raised.
        """
        self.index_name = index_name

        # Track component state
        self._initialized = False
        self.doc_processor = None
        self.vector_store = None
        self.graph_store = None
        self.hybrid_search = None
        self.response_generator = None

        try:
            # Initialize components in order
            self.doc_processor = DocumentProcessor(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                enable_chunking=enable_chunking,
                min_entity_freq=min_entity_freq,
                max_relation_distance=max_relation_distance
            )

            # Vector store first (faster to set up)
            self.vector_store = VectorStore(
                index_name=index_name,
                search_type=search_type,
                similarity_threshold=similarity_threshold,
                index_settings=index_settings,
                knn_params=knn_params
            )

            # A failure here is handled by the single handler below, which
            # cleans up the vector store exactly once via ``cleanup``.
            self.graph_store = GraphStore(
                **self._build_graph_config(index_name, graph_store_config)
            )

            # Initialize search after both stores ready
            self.hybrid_search = HybridSearch(
                graph_store=self.graph_store,
                vector_store=self.vector_store,
                k_graph=k_graph,
                k_vector=k_vector,
                alpha=alpha
            )

            self.response_generator = ResponseGenerator(
                max_retries=max_retries,
                min_delay=min_delay,
                max_delay=max_delay
            )

            self._initialized = True

        except Exception as e:
            # Clean up on initialization failure, but don't delete resources
            self.cleanup(delete_resources=False)
            raise RuntimeError(f"Failed to initialize GraphRAG: {str(e)}") from e

    @staticmethod
    def _build_graph_config(index_name: str, graph_store_config: Optional[Dict]) -> Dict[str, Any]:
        """Return the graph store keyword arguments, ignoring unsupported keys."""
        defaults = {
            "cluster_name": f"graph-rag-{index_name}",
            "enable_audit": True
        }
        overrides = graph_store_config or {}
        return {key: overrides.get(key, default) for key, default in defaults.items()}

    def ensure_initialized(self):
        """Ensure all components are properly initialized.

        Raises:
            RuntimeError: If initialization did not complete, or if the
                graph store or vector store reference has been cleared.
        """
        if not self._initialized:
            raise RuntimeError("GraphRAG not properly initialized")
        # Compare against None so a store object that happens to be falsy
        # is not mistaken for a missing one.
        if self.graph_store is None:
            raise RuntimeError("Graph store not available")
        if self.vector_store is None:
            raise RuntimeError("Vector store not available")

    def _collect_graph_context(self, search_results) -> List[Dict[str, Any]]:
        """Look up graph entities and relations for each search result."""
        graph_context = []
        for result in search_results:
            # Bound before the try so the failure message below can never
            # raise, even when the result has no "id".
            doc_id = None
            try:
                doc_id = result["id"]
                graph_context.append({
                    "doc_id": doc_id,
                    "entities": self.graph_store.get_document_entities(doc_id),
                    "relations": self.graph_store.get_document_relations(doc_id)
                })
            except Exception as e:
                print(f"Failed to get graph context for document {doc_id}: {str(e)}")
                # Skip this document's graph context
                continue
        return graph_context

    def query(self, query: str) -> Dict[str, Any]:
        """Process a query using graph-augmented retrieval.

        Args:
            query: User query

        Returns:
            Dictionary with the keys ``query``, ``response``, ``context``
            (the search results), ``graph_context`` (per-document entities
            and relations) and ``graph_query_time`` (seconds spent on search
            plus graph context lookup).

        Raises:
            RuntimeError: If the instance is not initialized, or if any
                step of query processing fails.
        """
        self.ensure_initialized()

        try:
            # Extract entities from query
            query_graph = self.doc_processor._extract_entities_relations(query)

            # Get query embedding from vector store
            query_vector = self.vector_store.get_embedding(query)

            # Track query timing
            start_time = time.time()

            # Perform hybrid search
            try:
                search_results = self.hybrid_search.search(
                    query_text=query,
                    query_vector=query_vector,
                    query_entities=query_graph["entities"]
                )
            except Exception as e:
                print(f"Hybrid search failed: {str(e)}")
                # Fall back to vector search only
                search_results = self.vector_store.search(
                    query_vector=query_vector,
                    k=self._FALLBACK_VECTOR_K
                )

            # Get graph context for each result
            graph_context = self._collect_graph_context(search_results)

            # Calculate query time
            query_time = time.time() - start_time

            # Generate response
            response = self.response_generator.generate(
                query=query,
                search_results=search_results,
                graph_context=graph_context
            )

            return {
                "query": query,
                "response": response,
                "context": search_results,
                "graph_context": graph_context,
                "graph_query_time": query_time
            }

        except Exception as e:
            raise RuntimeError(f"Failed to process query: {str(e)}") from e

    def cleanup(self, delete_resources: bool = False):
        """Clean up all resources in reverse initialization order.

        Store cleanup is best effort: an ordinary exception from one store
        is reported and does not prevent the other store from being cleaned
        up. ``KeyboardInterrupt`` and ``SystemExit`` are not suppressed.

        Args:
            delete_resources: Whether to delete Neptune/OpenSearch resources
        """
        # These components hold no resources of their own here; just clear
        # the references.
        self.hybrid_search = None
        self.response_generator = None

        for attr in ("graph_store", "vector_store"):
            # getattr with a default tolerates a partially constructed
            # instance; the None check avoids calling cleanup on a store
            # that was never created.
            store = getattr(self, attr, None)
            try:
                if store is not None:
                    store.cleanup(delete_resources=delete_resources)
            except Exception as e:
                print(f"Failed to clean up {attr}: {str(e)}")
            finally:
                setattr(self, attr, None)

        self.doc_processor = None
        self._initialized = False