# GraphRAG Document Ingestion

This notebook handles document processing and graph construction for the GraphRAG implementation.

## Process Overview
1. Load and preprocess documents using LangChain
2. Extract entities and relationships
3. Construct knowledge graph
4. Store document vectors
5. Update graph indices

## Configuration
- Document processing settings
- Entity extraction parameters
- Graph storage configuration
- Vector storage settings

In [None]:
import os
import sys
import json
import uuid
import time
import random
import boto3
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from botocore.exceptions import ClientError

# Make the directory two levels above the current working directory importable,
# so the `rag_implementations` package imports that follow can be resolved.
project_root = (Path("..") / "..").resolve()
if sys.path.count(str(project_root)) == 0:
    # Appended (not prepended), so locations already on sys.path keep priority.
    sys.path.append(str(project_root))

# Import components
from rag_implementations.graph_rag.components.document_processor import DocumentProcessor
from rag_implementations.graph_rag.components.graph_store import GraphStore
from rag_implementations.graph_rag.components.vector_store import VectorStore
from rag_implementations.graph_rag.components.response_generator import ResponseGenerator

In [None]:
def get_embedding(
    text: str,
    model_id: str = "cohere.embed-english-v3",
    max_retries: int = 5,
    min_delay: float = 1.0,
    max_delay: float = 60.0
) -> Optional[List[float]]:
    """Generate an embedding for ``text`` via Bedrock, retrying on throttling.

    The text is sent as a single-element ``texts`` list with
    ``input_type="search_document"``. Only ``ThrottlingException`` responses
    are retried (exponential backoff plus up to one second of random jitter,
    capped at ``max_delay``); every other failure is reported with ``print``
    and ends the call immediately. The function never raises for service
    errors: callers must check for ``None``.

    Args:
        text: Text to embed. Empty, whitespace-only, or non-string input is
            rejected without contacting the service.
        model_id: Bedrock model ID
        max_retries: Maximum number of attempts (including the first one)
        min_delay: Base delay between retries in seconds
        max_delay: Maximum delay between retries in seconds

    Returns:
        Embedding vector, or None if the text is unusable or the request failed
    """
    # Nothing to embed: skip client creation and the service round trip. This
    # also keeps the diagnostic prints below (len(text), text[:100]) from
    # raising on non-string input.
    if not isinstance(text, str) or not text.strip():
        print("Error getting embeddings: text must be a non-empty string")
        return None

    bedrock = boto3.client('bedrock-runtime')

    # The payload is identical for every attempt, so serialize it once.
    request_body = json.dumps({
        "texts": [text],
        "input_type": "search_document"
    })

    last_exception = None
    for attempt in range(max_retries):
        try:
            response = bedrock.invoke_model(
                modelId=model_id,
                body=request_body
            )
            response_body = json.loads(response['body'].read())
            return response_body['embeddings'][0]

        except ClientError as e:
            last_exception = e
            # Use .get() so an error response without the usual
            # Error/Code structure cannot raise KeyError from inside
            # this handler and escape the function.
            error_code = e.response.get('Error', {}).get('Code')

            if error_code != 'ThrottlingException':
                print(f"Error getting embeddings: {str(e)}")
                print(f"Text length: {len(text)} chars, {len(text.split())} words")
                return None

            if attempt == max_retries - 1:
                print(f"Max retries exceeded for text: {text[:100]}...")
                return None

            # Exponential backoff with jitter
            delay = min(
                max_delay,
                min_delay * (2 ** attempt) + random.uniform(0, 1)
            )
            time.sleep(delay)

        except Exception as e:
            print(f"Error getting embeddings: {str(e)}")
            print(f"Text length: {len(text)} chars, {len(text.split())} words")
            return None

    # Only reached when the loop body never ran (max_retries < 1).
    print(f"Failed after {max_retries} attempts: {str(last_exception)}")
    return None

In [None]:
def ingest_documents(
    source_path: str,
    rag_instance: Any,
    metadata: Optional[Dict] = None,
    batch_size: int = 100
) -> None:
    """Process documents, embed them, and store them in the vector and graph stores.

    Documents are loaded through ``rag_instance.doc_processor``, embedded one
    at a time with ``get_embedding``, and written batch by batch: first to
    ``rag_instance.vector_store`` (one call per batch), then to
    ``rag_instance.graph_store`` (one call per document). Per-document and
    per-batch errors are printed and counted rather than raised, and a summary
    is printed at the end.

    Args:
        source_path: Path to file or directory
        rag_instance: GraphRAG instance exposing ``doc_processor``,
            ``vector_store`` and ``graph_store``
        metadata: Optional metadata to add to all documents
        batch_size: Number of documents to process in each batch (must be >= 1)

    Raises:
        ValueError: If ``batch_size`` is less than 1, or ``source_path`` is
            neither an existing file nor an existing directory.
    """
    # Fail fast. A zero step would make range() raise only after every document
    # had already been processed, and a negative step would silently yield no
    # batches at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Process documents
    if os.path.isfile(source_path):
        documents = rag_instance.doc_processor.process_files(
            [source_path],
            metadata=metadata
        )
    elif os.path.isdir(source_path):
        documents = rag_instance.doc_processor.process_directory(
            source_path,
            metadata=metadata
        )
    else:
        raise ValueError(f"Invalid source path: {source_path}")

    # Track success/failure counts
    success_count = 0
    failure_count = 0

    # Process in batches
    for i in tqdm(range(0, len(documents), batch_size), desc="Processing documents"):
        batch = documents[i:i + batch_size]

        # Entries staged for storage; the two lists are kept in lockstep.
        vector_docs = []
        graph_docs = []

        for doc in batch:
            try:
                # Generate unique ID
                doc_id = str(uuid.uuid4())

                # Get document embedding first
                embedding = get_embedding(doc["content"])
                if embedding is None:
                    print(f"Skipping document {doc_id} due to embedding error")
                    failure_count += 1
                    continue

                # Build both entries before staging either one, so a document
                # missing a required key is never stored in one store only.
                vector_entry = {
                    'content': doc["content"],
                    'vector': embedding,
                    'metadata': doc["metadata"]
                }
                graph_entry = {
                    'doc_id': doc_id,
                    'content': doc["content"],
                    'metadata': doc["metadata"],
                    'graph_data': doc["graph_data"]
                }
            except Exception as e:
                print(f"Error processing document: {str(e)}")
                failure_count += 1
                continue

            vector_docs.append(vector_entry)
            graph_docs.append(graph_entry)
            success_count += 1

        if not vector_docs:
            continue

        # Store batch in vector store; on failure skip graph storage for this
        # batch so the graph never references documents without vectors.
        try:
            rag_instance.vector_store.store_documents(vector_docs)
        except Exception as e:
            print(f"Error storing vectors: {str(e)}")
            failure_count += len(vector_docs)
            success_count -= len(vector_docs)
            continue

        # Store each document in the graph store independently: one bad
        # document must not abort the rest of the batch, and documents that
        # were stored successfully must not be counted as failures.
        # NOTE: a document failing here already has its vector stored.
        for graph_doc in graph_docs:
            try:
                rag_instance.graph_store.store_document(
                    doc_id=graph_doc['doc_id'],
                    content=graph_doc['content'],
                    metadata=graph_doc['metadata'],
                    graph_data=graph_doc['graph_data']
                )
            except Exception as e:
                print(
                    f"Error storing graph data for document "
                    f"{graph_doc['doc_id']}: {str(e)}"
                )
                failure_count += 1
                success_count -= 1

    print("\nIngestion complete:")
    print(f"Successfully processed: {success_count} documents")
    print(f"Failed to process: {failure_count} documents")