# GraphRAG Document Ingestion

This notebook handles document processing and graph construction for the GraphRAG implementation.

## Process Overview

1. Load and preprocess documents
2. Extract entities and relationships
3. Construct knowledge graph
4. Store document vectors
5. Update graph indices

## Configuration

- Batch size for processing (the `batch_size` argument of `ingest_documents`, default 100)
- Entity extraction settings
- Relationship confidence thresholds
- Graph storage parameters

In [None]:
import os
import sys
import json
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm

# Add project root to path for imports.
# The root is computed as two directory levels above the current working
# directory (the relative path is resolved against the CWD), so this
# presumably expects the notebook to be run from its own folder — TODO confirm.
project_root = Path("../..").resolve()
# Append only when missing so re-running this cell does not add duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import utilities
from utils.notebook_utils.document_utils import load_documents, process_documents
from utils.notebook_utils.importable import notebook_to_module

In [None]:
def ingest_documents(
    source_dir: str,
    rag_instance: Any,
    metadata: Optional[Dict[str, Any]] = None,
    batch_size: int = 100
) -> None:
    """
    Process documents and construct graph representation.

    Documents are loaded from ``source_dir`` with ``load_documents`` and run
    through ``process_documents`` one batch at a time. For every processed
    document the function then:

    1. merges ``metadata`` (if given) into the document's own metadata,
    2. extracts graph data from the content and stores it under the
       document id,
    3. requests an embedding for the content, and
    4. indexes content, embedding and metadata in the vector index.

    Args:
        source_dir: Directory containing source documents
        rag_instance: GraphRAG instance. It must provide ``chunk_size``,
            ``chunk_overlap``, ``enable_chunking``, ``index_name``,
            ``_extract_entities_relations``, ``_store_graph_data``,
            ``llm.get_embedding`` and ``opensearch.index``.
        metadata: Optional metadata to attach to documents. When non-empty it
            is merged in place into each processed document's ``"metadata"``
            mapping.
        batch_size: Number of documents to process in each batch. Must be
            at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before doing any work: a zero batch size would otherwise fail
    # with an incidental ZeroDivisionError, and a negative one would produce
    # an empty range so nothing is ingested while success is still reported.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # Load documents
    print("Loading documents...")
    documents = load_documents(source_dir)
    num_documents = len(documents)

    # Ceiling division: a final partial batch still counts as one batch.
    total_batches = (num_documents + batch_size - 1) // batch_size

    for start in tqdm(range(0, num_documents, batch_size), total=total_batches):
        batch = documents[start:start + batch_size]

        # Chunking behaviour is taken from the RAG instance's configuration.
        processed_docs = process_documents(
            batch,
            chunk_size=rag_instance.chunk_size,
            chunk_overlap=rag_instance.chunk_overlap,
            enable_chunking=rag_instance.enable_chunking
        )

        # Each processed document is accessed by its "id", "content" and
        # "metadata" keys.
        for doc in processed_docs:
            # Caller-supplied metadata overrides document keys of the same name.
            if metadata:
                doc["metadata"].update(metadata)

            # Extract graph data and store it in the graph database
            graph_data = rag_instance._extract_entities_relations(doc["content"])
            rag_instance._store_graph_data(doc["id"], graph_data)

            # Get document embedding
            embedding = rag_instance.llm.get_embedding(doc["content"])

            # Store in vector index, keyed by the same id as the graph data
            rag_instance.opensearch.index(
                index=rag_instance.index_name,
                id=doc["id"],
                body={
                    "content": doc["content"],
                    "vector": embedding,
                    "metadata": doc["metadata"]
                }
            )

    print(f"Processed {num_documents} documents")