# GraphRAG Document Ingestion

This notebook handles document processing and graph construction for the GraphRAG implementation.

## Process Overview

1. Load and preprocess documents using LangChain
2. Extract entities and relationships
3. Construct knowledge graph
4. Store document vectors
5. Update graph indices

## Configuration

- Batch size for processing
- Entity extraction settings
- Relationship confidence thresholds
- Graph storage parameters

In [None]:
import os
from pathlib import Path
from typing import List, Dict, Any, Optional

from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
    UnstructuredFileLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from tqdm.auto import tqdm

# Add project root to path for imports
# `sys` is not part of this cell's import block, so bring it into scope here;
# without it the sys.path check below raises NameError on a fresh kernel.
import sys

project_root = Path("../..").resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import utilities
from utils.notebook_utils.importable import notebook_to_module

In [None]:
def process_documents(
    file_paths: List[str],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    enable_chunking: bool = True,
    metadata: Optional[Dict] = None
) -> List[Dict[str, Any]]:
    """Load files with LangChain loaders and turn them into RAG records.

    Each file is loaded with the loader registered for its (lower-cased)
    suffix, falling back to ``UnstructuredFileLoader`` for anything else.
    A file that fails at any step is reported on stdout and skipped, so one
    bad file does not abort the whole run.

    Args:
        file_paths: List of paths to documents
        chunk_size: Maximum number of characters per chunk
        chunk_overlap: Number of characters to overlap between chunks
        enable_chunking: Whether to split documents into chunks
        metadata: Optional metadata to add to all documents

    Returns:
        List of ``{'content': ..., 'metadata': ...}`` dicts. Every metadata
        dict carries ``source_file``, ``file_type`` and ``file_name``; when
        chunking is enabled it also carries ``chunk_index`` and
        ``total_chunks`` (counted per file).
    """
    # The splitter is only needed (and only constructed) when chunking is on
    splitter = (
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        if enable_chunking
        else None
    )

    # Suffix -> loader class; unknown suffixes use the fallback loader
    loader_by_suffix = {
        '.pdf': PyPDFLoader,
        '.txt': TextLoader,
        '.docx': Docx2txtLoader,
    }
    fallback_loader = UnstructuredFileLoader

    records = []
    for file_path in file_paths:
        try:
            path_obj = Path(file_path)
            ext = path_obj.suffix.lower()

            # Pick the loader for this suffix and read the file
            loader_cls = loader_by_suffix.get(ext, fallback_loader)
            docs = loader_cls(file_path).load()

            # Caller metadata first, then the per-file fields (which win on clashes)
            shared_metadata = {
                **(metadata if metadata else {}),
                'source_file': file_path,
                'file_type': ext,
                'file_name': path_obj.name,
            }
            for doc in docs:
                doc.metadata.update(shared_metadata)

            if not enable_chunking:
                # Emit the loaded documents unchanged
                records.extend(
                    {'content': doc.page_content, 'metadata': doc.metadata}
                    for doc in docs
                )
                continue

            # Split, then tag each chunk with its position within this file
            chunks = splitter.split_documents(docs)
            total = len(chunks)
            records.extend(
                {
                    'content': chunk.page_content,
                    'metadata': {
                        **chunk.metadata,
                        'chunk_index': idx,
                        'total_chunks': total,
                    },
                }
                for idx, chunk in enumerate(chunks)
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f"Error processing {file_path}: {str(e)}")

    return records

def process_directory(
    dir_path: str, 
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    enable_chunking: bool = True,
    metadata: Optional[Dict] = None, 
    recursive: bool = True
) -> List[Dict[str, Any]]:
    """Process all supported documents in a directory

    Collects every ``.txt``, ``.pdf`` and ``.docx`` file under ``dir_path``
    and hands the resulting paths to ``process_documents``. The extension
    check is case-insensitive so files such as ``REPORT.PDF`` are picked up.
    
    Args:
        dir_path: Path to directory
        chunk_size: Maximum number of characters per chunk
        chunk_overlap: Number of characters to overlap between chunks
        enable_chunking: Whether to split documents into chunks
        metadata: Optional metadata to add to all documents
        recursive: Whether to process subdirectories
        
    Returns:
        List of processed documents ready for RAG ingestion
    """
    # Single source of truth for the accepted suffixes (add more as needed)
    supported_extensions = ('.txt', '.pdf', '.docx')

    def is_supported(file_name: str) -> bool:
        # Compare on the lower-cased name so '.PDF' / '.Txt' are not skipped
        return file_name.lower().endswith(supported_extensions)

    file_paths = []
    if recursive:
        # os.walk already separates files from directories
        for root, _, files in os.walk(dir_path):
            for file_name in files:
                if is_supported(file_name):
                    file_paths.append(os.path.join(root, file_name))
    else:
        # os.listdir returns directories too; keep regular files only so a
        # folder named e.g. 'notes.txt' is not handed to a file loader
        for entry in os.listdir(dir_path):
            candidate = os.path.join(dir_path, entry)
            if is_supported(entry) and os.path.isfile(candidate):
                file_paths.append(candidate)
    
    return process_documents(
        file_paths, 
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        enable_chunking=enable_chunking,
        metadata=metadata
    )

def ingest_documents(
    source_path: str, 
    rag_instance: Any, 
    metadata: Optional[Dict] = None, 
    batch_size: int = 100
) -> None:
    """Ingest documents into GraphRAG system

    Processes ``source_path`` (a single file or a directory) using the
    chunking settings read from ``rag_instance``. For every resulting
    document it then extracts graph data, stores it via ``rag_instance``,
    computes an embedding, and indexes the document in
    ``rag_instance.opensearch``.
    
    Args:
        source_path: Path to file or directory
        rag_instance: GraphRAG instance; this function reads its
            ``chunk_size``, ``chunk_overlap``, ``enable_chunking``,
            ``index_name``, ``llm`` and ``opensearch`` attributes and calls
            its ``_extract_entities_relations`` and ``_store_graph_data``
            methods
        metadata: Optional metadata to add to all documents
        batch_size: Number of documents to process in each batch

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if
            ``source_path`` is neither an existing file nor a directory
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently ingests nothing, so reject both up front
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Process documents
    if os.path.isfile(source_path):
        documents = process_documents(
            [source_path], 
            chunk_size=rag_instance.chunk_size,
            chunk_overlap=rag_instance.chunk_overlap,
            enable_chunking=rag_instance.enable_chunking,
            metadata=metadata
        )
    elif os.path.isdir(source_path):
        documents = process_directory(
            source_path, 
            chunk_size=rag_instance.chunk_size,
            chunk_overlap=rag_instance.chunk_overlap,
            enable_chunking=rag_instance.enable_chunking,
            metadata=metadata
        )
    else:
        raise ValueError(f"Invalid source path: {source_path}")
    
    # Process in batches
    for batch_start in tqdm(range(0, len(documents), batch_size), desc="Processing documents"):
        batch = documents[batch_start:batch_start + batch_size]
        
        # Process each document
        for offset, doc in enumerate(batch):
            # Add document ID if not present. The ID is built from the
            # document's position in the full list, not the batch start,
            # otherwise every document in a batch would share one ID and
            # overwrite the others in the graph store and the vector index.
            # NOTE(review): these IDs are only unique within a single call;
            # separate calls both start at doc_0 - confirm whether callers
            # rely on that before ingesting several sources into one index.
            if 'id' not in doc:
                doc['id'] = f"doc_{batch_start + offset}"
                
            # Extract graph data
            graph_data = rag_instance._extract_entities_relations(doc["content"])
            
            # Store in graph database
            rag_instance._store_graph_data(doc["id"], graph_data)
            
            # Get document embedding
            embedding = rag_instance.llm.get_embedding(doc["content"])
            
            # Store in vector index
            rag_instance.opensearch.index(
                index=rag_instance.index_name,
                id=doc["id"],
                body={
                    "content": doc["content"],
                    "vector": embedding,
                    "metadata": doc["metadata"]
                }
            )
    
    print(f"Processed {len(documents)} documents")