# Document Ingestion for Baseline RAG

This notebook demonstrates document ingestion for the baseline RAG system using LangChain's document processing utilities.

## Features
- Robust document loading with specialized loaders for different file types
- Smart text splitting that preserves context
- Metadata preservation and enhancement
- Batch processing for large document sets

## Usage
1. Load source material (text files, PDFs, etc.)
2. Process and chunk documents
3. Ingest into the RAG system

In [None]:
import os
from pathlib import Path
from typing import List, Dict, Any, Optional

from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
    Docx2txtLoader,
    UnstructuredFileLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [None]:
def process_documents(
    file_paths: List[str],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    enable_chunking: bool = True,
    metadata: Optional[Dict] = None
) -> List[Dict[str, Any]]:
    """Load files with LangChain loaders and convert them to RAG-ready dicts.

    Each file is loaded with the loader registered for its (lower-cased)
    extension, falling back to ``UnstructuredFileLoader`` for anything else.
    Every loaded document is tagged with ``source_file``, ``file_type`` and
    ``file_name`` on top of the caller-supplied ``metadata``. When chunking is
    enabled the documents are split and each chunk additionally carries
    ``chunk_index`` and ``total_chunks`` (counted per file).

    A file that raises while being loaded or split is reported with ``print``
    and skipped; the remaining files are still processed.

    Args:
        file_paths: List of paths to documents
        chunk_size: Maximum number of characters per chunk
        chunk_overlap: Number of characters to overlap between chunks
        enable_chunking: Whether to split documents into chunks
        metadata: Optional metadata to add to all documents

    Returns:
        List of ``{'content': ..., 'metadata': ...}`` dicts ready for RAG
        ingestion
    """
    # The splitter is only needed (and only built) when chunking is requested.
    splitter = (
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        if enable_chunking
        else None
    )

    # Extension -> loader class; unknown extensions use the generic loader.
    loader_by_suffix = {
        '.pdf': PyPDFLoader,
        '.txt': TextLoader,
        '.docx': Docx2txtLoader,
    }
    fallback_loader = UnstructuredFileLoader

    results = []
    for path in file_paths:
        try:
            suffix = Path(path).suffix.lower()
            loader_cls = loader_by_suffix.get(suffix, fallback_loader)
            loaded = loader_cls(path).load()

            # Caller metadata first, then the per-file fields (which win on
            # key collisions).
            shared = metadata.copy() if metadata else {}
            shared['source_file'] = path
            shared['file_type'] = suffix
            shared['file_name'] = Path(path).name

            for document in loaded:
                document.metadata.update(shared)

            if not enable_chunking:
                # Keep documents as loaded, one entry per loader document.
                for document in loaded:
                    results.append({
                        'content': document.page_content,
                        'metadata': document.metadata
                    })
                continue

            pieces = splitter.split_documents(loaded)
            piece_count = len(pieces)
            for position, piece in enumerate(pieces):
                piece_metadata = piece.metadata.copy()
                piece_metadata['chunk_index'] = position
                piece_metadata['total_chunks'] = piece_count
                results.append({
                    'content': piece.page_content,
                    'metadata': piece_metadata
                })

        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole batch.
            print(f"Error processing {path}: {str(e)}")

    return results

In [None]:
def process_directory(
    dir_path: str,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    enable_chunking: bool = True,
    metadata: Optional[Dict] = None,
    recursive: bool = True
) -> List[Dict[str, Any]]:
    """Process all supported documents in a directory

    Files are selected by extension (``.txt``, ``.pdf``, ``.docx``), compared
    case-insensitively so that e.g. ``REPORT.PDF`` is picked up as well. The
    selected paths are sorted before processing so the output order does not
    depend on the order in which the filesystem lists entries.

    Args:
        dir_path: Path to directory
        chunk_size: Maximum number of characters per chunk
        chunk_overlap: Number of characters to overlap between chunks
        enable_chunking: Whether to split documents into chunks
        metadata: Optional metadata to add to all documents
        recursive: Whether to process subdirectories

    Returns:
        List of processed documents ready for RAG ingestion
    """
    supported_suffixes = ('.txt', '.pdf', '.docx')  # Add more extensions as needed

    def is_supported(name: str) -> bool:
        # Lower-case first: process_documents also lower-cases the suffix
        # when it picks a loader, so the two must agree.
        return name.lower().endswith(supported_suffixes)

    if recursive:
        file_paths = [
            os.path.join(root, name)
            for root, _, names in os.walk(dir_path)
            for name in names
            if is_supported(name)
        ]
    else:
        file_paths = []
        for name in os.listdir(dir_path):
            candidate = os.path.join(dir_path, name)
            # os.listdir also returns sub-directories; a directory named
            # e.g. "archive.txt" must not be handed to a file loader.
            if is_supported(name) and os.path.isfile(candidate):
                file_paths.append(candidate)

    # Deterministic processing order regardless of filesystem listing order.
    file_paths.sort()

    return process_documents(
        file_paths,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        enable_chunking=enable_chunking,
        metadata=metadata
    )

In [None]:
def ingest_documents(
    source_path: str,
    rag_system: Any,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    enable_chunking: bool = True,
    metadata: Optional[Dict] = None,
    batch_size: int = 100
):
    """Ingest documents from a file or directory into RAG system

    A single file is handed to ``process_documents``; a directory is handed
    to ``process_directory`` (using that function's default ``recursive``
    setting). The processed documents are then passed to the RAG system in
    slices of at most ``batch_size`` items.

    Args:
        source_path: Path to file or directory
        rag_system: Object exposing ``_store_documents(batch, batch_size=...)``,
            which is called once per batch
        chunk_size: Maximum number of characters per chunk
        chunk_overlap: Number of characters to overlap between chunks
        enable_chunking: Whether to split documents into chunks
        metadata: Optional metadata to add to all documents
        batch_size: Number of documents to process in each batch (must be >= 1)

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``source_path``
            is neither an existing file nor an existing directory.
    """
    # Validate up front: a negative step would make the batching loop below
    # silently ingest nothing, and zero would only fail after all documents
    # had already been loaded and split.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Process documents
    if os.path.isfile(source_path):
        documents = process_documents(
            [source_path],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            enable_chunking=enable_chunking,
            metadata=metadata
        )
    elif os.path.isdir(source_path):
        documents = process_directory(
            source_path,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            enable_chunking=enable_chunking,
            metadata=metadata
        )
    else:
        raise ValueError(f"Invalid source path: {source_path}")

    # Ingest in batches
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        rag_system._store_documents(batch, batch_size=batch_size)

In [None]:
# Example usage
def test_ingestion():
    """Test the document ingestion pipeline

    Writes two small text files into a fresh temporary directory, ingests
    them into a ``BaselineRAG`` instance loaded from ``implementation.ipynb``,
    runs one query and prints the response together with the retrieved
    context. The temporary directory is removed afterwards, even if ingestion
    or the query raises.
    """
    # Local import, like the other demo-only imports in this function.
    import tempfile

    # Import our RAG implementation
    from utils.notebook_utils.importable import notebook_to_module
    baseline_rag = notebook_to_module('implementation.ipynb')
    BaselineRAG = baseline_rag.BaselineRAG

    # A private temporary directory avoids deleting a pre-existing
    # "test_docs" folder in the working directory, and the context manager
    # guarantees cleanup on every exit path.
    with tempfile.TemporaryDirectory(prefix='test_docs_') as docs_dir:
        # Create test documents
        with open(os.path.join(docs_dir, 'doc1.txt'), 'w', encoding='utf-8') as f:
            f.write("""
        Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from data.
        Deep learning is a type of machine learning that uses neural networks with multiple layers.
        Reinforcement learning is another type of machine learning where agents learn by interacting with an environment.
        """)

        with open(os.path.join(docs_dir, 'doc2.txt'), 'w', encoding='utf-8') as f:
            f.write("""
        Natural Language Processing (NLP) is a field of AI that focuses on interactions between computers and human language.
        Common NLP tasks include text classification, named entity recognition, and machine translation.
        """)

        # Initialize RAG system
        rag = BaselineRAG(index_name="test-rag-documents")

        # Test ingestion
        print("Ingesting documents...")
        ingest_documents(
            docs_dir,
            rag,
            metadata={'dataset': 'test', 'domain': 'AI/ML'},
            batch_size=2
        )

        # Test query
        print("\nTesting query...")
        result = rag.query("What is machine learning and deep learning?")

        print("\nResponse:", result['response'])
        print("\nContext used:")
        for doc in result['context']:
            print(f"- {doc['content']}")
            print(f"  Metadata: {doc['metadata']}")

# Run the ingestion demo only when this file is executed directly as a script.
if __name__ == "__main__":
    test_ingestion()