# GraphRAG Document Ingestion

This notebook handles document processing and graph construction for the GraphRAG implementation.

## Process Overview
1. Load and preprocess documents using LangChain
2. Extract entities and relationships
3. Construct knowledge graph
4. Store document vectors
5. Update graph indices

## Configuration
- Document processing settings
- Entity extraction parameters
- Graph storage configuration
- Vector storage settings

In [None]:
import os
import sys
import json
import uuid
from pathlib import Path
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm

# Make the directory two levels up importable (used by the project imports
# that follow); skip the append when it is already on sys.path.
project_root = Path("../..").resolve()
if os.fspath(project_root) not in sys.path:
    sys.path.append(os.fspath(project_root))

# Import components
from components.document_processor import DocumentProcessor
from components.graph_store import GraphStore
from components.vector_store import VectorStore
from components.response_generator import ResponseGenerator

In [None]:
def ingest_documents(
    source_path: str,
    rag_instance: Any,
    metadata: Optional[Dict] = None,
    batch_size: int = 100
) -> None:
    """Process documents and store them in the graph and vector stores.

    The source is processed through ``rag_instance.doc_processor``; each
    resulting document is then written to ``rag_instance.graph_store``,
    embedded via ``rag_instance.response_generator.llm.get_embedding`` and
    written to ``rag_instance.vector_store``.

    Args:
        source_path: Path to file or directory
        rag_instance: GraphRAG instance
        metadata: Optional metadata to add to all documents
        batch_size: Number of documents to process in each batch; must be
            at least 1

    Raises:
        ValueError: If ``batch_size`` is less than 1, or if ``source_path``
            is neither an existing file nor an existing directory.
    """
    # Fail fast on a bad batch size. A step of 0 would make range() raise
    # only after every document had already been processed, and a negative
    # step yields no batches at all, so nothing would be stored while the
    # final message still reported success.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    # Process documents
    if os.path.isfile(source_path):
        documents = rag_instance.doc_processor.process_files(
            [source_path],
            metadata=metadata
        )
    elif os.path.isdir(source_path):
        documents = rag_instance.doc_processor.process_directory(
            source_path,
            metadata=metadata
        )
    else:
        raise ValueError(f"Invalid source path: {source_path}")

    # Process in batches; the progress bar advances once per batch
    for start in tqdm(range(0, len(documents), batch_size), desc="Processing documents"):
        batch = documents[start:start + batch_size]

        # Each document is read as a mapping with the keys "id", "content",
        # "metadata" and "graph_data".
        for doc in batch:
            # Store in graph store
            rag_instance.graph_store.store_document(
                doc_id=doc["id"],
                content=doc["content"],
                metadata=doc["metadata"],
                graph_data=doc["graph_data"]
            )

            # Get document embedding
            embedding = rag_instance.response_generator.llm.get_embedding(
                doc["content"]
            )

            # Store in vector index
            rag_instance.vector_store.store_document(
                doc_id=doc["id"],
                content=doc["content"],
                vector=embedding,
                metadata=doc["metadata"]
            )

    print(f"Processed {len(documents)} documents")