# Document Ingestion for Baseline RAG

This notebook handles document preprocessing and ingestion for the baseline RAG system. It can process any text-based source material and prepare it for RAG.

## Features
- Text extraction and cleaning
- Document chunking with configurable size and overlap
- Metadata preservation
- Batch processing for large document sets

## Usage
1. Load source material (currently plain-text `.txt` files; other formats such as PDF are not yet supported)
2. Process and chunk documents
3. Ingest into RAG system

In [None]:
import os
import json
from pathlib import Path
from typing import List, Dict, Any, Optional, Generator
from tqdm import tqdm

# Import our RAG implementation
from implementation import AWSConfig, BaselineRAG

In [None]:
class DocumentPreprocessor:
    """Handles document preprocessing for RAG ingestion

    Cleans raw text, splits it into overlapping character-based chunks
    (preferring to end a chunk at a sentence boundary) and attaches
    per-chunk metadata.
    """

    # Width, in characters, of the region at the end of a chunk that is
    # searched for a sentence boundary.
    _BOUNDARY_WINDOW = 100
    # Punctuation followed by a space is treated as the end of a sentence.
    _SENTENCE_ENDINGS = ('. ', '! ', '? ')

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize preprocessor

        Args:
            chunk_size: Maximum number of characters per chunk
            chunk_overlap: Number of characters to overlap between chunks

        Raises:
            ValueError: If chunk_size is not positive, or chunk_overlap is
                negative or not smaller than chunk_size (such settings would
                prevent the chunker from ever advancing through the text)
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def clean_text(self, text: str) -> str:
        """Clean and normalize text

        Args:
            text: Raw text content

        Returns:
            Text with every run of whitespace collapsed to a single space
            and leading/trailing whitespace removed
        """
        # Remove excessive whitespace
        text = ' '.join(text.split())

        # TODO: Add more cleaning steps as needed
        return text

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks

        Each chunk spans at most chunk_size characters. When a chunk does not
        reach the end of the text, it is shortened to end just after the last
        sentence boundary found in its final _BOUNDARY_WINDOW characters,
        provided that doing so still lets the next chunk start further along.

        Args:
            text: Text to chunk

        Returns:
            List of non-empty, stripped text chunks
        """
        chunks = []
        text_len = len(text)
        start = 0

        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            # If we're not at the end of the text, try to break at a sentence
            if end < text_len:
                # Never look before the current chunk (matters when
                # chunk_size is smaller than the search window).
                window_start = max(start, end - self._BOUNDARY_WINDOW)
                window = text[window_start:end]
                boundary = max(window.rfind(mark) for mark in self._SENTENCE_ENDINGS)

                if boundary != -1:
                    # +2 keeps the punctuation and skips the following space
                    candidate = window_start + boundary + 2
                    # Ignore a boundary so early that the overlap would pull
                    # the next chunk back to (or before) the current start.
                    if candidate - self.chunk_overlap > start:
                        end = candidate

            chunk = text[start:end].strip()
            if chunk:  # Only add non-empty chunks
                chunks.append(chunk)

            # Once the end of the text is consumed we are done; stepping back
            # by the overlap here would only re-emit a tail of this chunk.
            if end >= text_len:
                break

            # Move the start position, accounting for overlap. The max()
            # guarantees progress even if the attributes were changed to
            # inconsistent values after construction.
            start = max(end - self.chunk_overlap, start + 1)

        return chunks

    def process_document(self, content: str, metadata: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Process a single document

        Args:
            content: Document text content
            metadata: Optional metadata to preserve (copied, never mutated)

        Returns:
            List of dicts with 'content' (chunk text) and 'metadata' (the
            given metadata plus 'chunk_index' and 'total_chunks')
        """
        cleaned_text = self.clean_text(content)
        chunks = self.chunk_text(cleaned_text)

        base_metadata = metadata if metadata else {}
        total_chunks = len(chunks)

        # Each chunk gets its own metadata dict so later edits to one chunk's
        # metadata cannot leak into another chunk or into the caller's dict.
        return [
            {
                'content': chunk,
                'metadata': {
                    **base_metadata,
                    'chunk_index': i,
                    'total_chunks': total_chunks,
                },
            }
            for i, chunk in enumerate(chunks)
        ]

    def process_text_file(self, file_path: str, metadata: Optional[Dict] = None) -> List[Dict[str, Any]]:
        """Process a text file

        Args:
            file_path: Path to text file (read as UTF-8)
            metadata: Optional metadata to preserve

        Returns:
            List of processed chunks with metadata; 'source_file' and
            'file_type' are added to every chunk's metadata
        """
        # Add file info to metadata
        file_metadata = metadata.copy() if metadata else {}
        file_metadata.update({
            'source_file': file_path,
            'file_type': 'text'
        })

        # Read and process file
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        return self.process_document(content, file_metadata)

    def process_directory(self, dir_path: str, metadata: Optional[Dict] = None) -> Generator[Dict[str, Any], None, None]:
        """Process all text files in a directory

        Walks dir_path recursively and processes every file whose name ends
        in '.txt'. Processing is best-effort: a file that fails is reported
        on stdout and skipped.

        Args:
            dir_path: Path to directory
            metadata: Optional metadata to preserve

        Yields:
            Processed chunks with metadata
        """
        for root, dirs, files in os.walk(dir_path):
            # Sort in place so traversal (and therefore ingestion order) is
            # deterministic regardless of filesystem listing order.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith('.txt'):  # TODO: Add support for more file types
                    continue

                file_path = os.path.join(root, file)
                try:
                    documents = self.process_text_file(file_path, metadata)
                except Exception as e:
                    print(f"Error processing {file_path}: {str(e)}")
                    continue

                # Yield outside the try so only file-processing errors are
                # swallowed, not exceptions raised by the consumer.
                yield from documents

In [None]:
def ingest_documents(source_path: str, rag_system: BaselineRAG, metadata: Optional[Dict] = None, batch_size: int = 100, preprocessor: Optional["DocumentPreprocessor"] = None):
    """Ingest documents from a file or directory into RAG system

    A single file is chunked and handed to the RAG system in one call.
    A directory is streamed: chunks are accumulated and flushed to the RAG
    system every batch_size chunks, so the whole directory never has to be
    held in memory at once.

    Args:
        source_path: Path to file or directory
        rag_system: Initialized RAG system; must provide
            ingest_documents(documents, batch_size=...)
        metadata: Optional metadata to preserve
        batch_size: Number of documents to process in each batch
        preprocessor: Optional preprocessor to use (for example one with a
            custom chunk size or overlap). Defaults to a DocumentPreprocessor
            with its default settings.

    Raises:
        ValueError: If source_path is neither an existing file nor directory
    """
    if preprocessor is None:
        preprocessor = DocumentPreprocessor()

    # Process and ingest documents
    if os.path.isfile(source_path):
        # Process single file
        documents = preprocessor.process_text_file(source_path, metadata)
        rag_system.ingest_documents(documents, batch_size=batch_size)

    elif os.path.isdir(source_path):
        # Process directory
        batch = []

        for doc in preprocessor.process_directory(source_path, metadata):
            batch.append(doc)

            if len(batch) >= batch_size:
                rag_system.ingest_documents(batch, batch_size=batch_size)
                batch = []

        # Ingest any remaining documents
        if batch:
            rag_system.ingest_documents(batch, batch_size=batch_size)

    else:
        raise ValueError(f"Invalid source path: {source_path}")

In [None]:
# Example usage
def test_ingestion():
    """Run an end-to-end smoke test of ingestion and querying

    Writes two small text files to ./test_docs, ingests them into a
    BaselineRAG index, runs one query and prints the response together with
    the context documents it used. The test_docs directory is removed
    afterwards, even if ingestion or the query fails.
    """
    import shutil

    # Create test documents
    os.makedirs('test_docs', exist_ok=True)

    try:
        # Write as UTF-8 explicitly: the preprocessor reads files as UTF-8,
        # and the platform default encoding may differ.
        with open('test_docs/doc1.txt', 'w', encoding='utf-8') as f:
            f.write("""Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from data.
        Deep learning is a type of machine learning that uses neural networks with multiple layers.
        Reinforcement learning is another type of machine learning where agents learn by interacting with an environment.""")

        with open('test_docs/doc2.txt', 'w', encoding='utf-8') as f:
            f.write("""Natural Language Processing (NLP) is a field of AI that focuses on interactions between computers and human language.
        Common NLP tasks include text classification, named entity recognition, and machine translation.""")

        # Initialize RAG system
        config = AWSConfig()
        rag = BaselineRAG(config, index_name="test-rag-documents")

        # Test ingestion
        print("Ingesting documents...")
        ingest_documents(
            'test_docs',
            rag,
            metadata={'dataset': 'test', 'domain': 'AI/ML'},
            batch_size=2
        )

        # Test query
        print("\nTesting query...")
        result = rag.query("What is machine learning and deep learning?")

        print("\nResponse:", result['response'])
        print("\nContext used:")
        for doc in result['context']:
            print(f"- {doc['content']}")
            print(f"  Metadata: {doc['metadata']}")
    finally:
        # Cleanup; ignore_errors keeps a cleanup failure from masking the
        # original exception, if any.
        shutil.rmtree('test_docs', ignore_errors=True)

# Run the example only when this module is executed directly, not on import.
if __name__ == "__main__":
    test_ingestion()