# Baseline RAG Implementation

This notebook implements a generic baseline retrieval-augmented generation (RAG) system that can be used with any source material. It uses:

- Cohere Embed English (cohere.embed-english-v3) for embeddings
- Claude 3.5 Sonnet for LLM responses
- Amazon OpenSearch for vector storage

## Features
- Generic document ingestion
- Vector similarity search
- Context-aware response generation
- Automatic retry with exponential backoff and jitter when Bedrock throttles a request

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Vector Search
- k: Number of context documents to retrieve; passed per call to `query()` / `semantic_search()` rather than to the constructor (default: 3)
- search_type: Type of vector search to use ('script' or 'knn', default: 'script')
- similarity_threshold: Minimum similarity score to include (default: None)

### OpenSearch
- index_settings: Custom index settings for performance tuning
- knn_params: Parameters for k-NN algorithm (e.g., ef_search)

### API Settings
- max_retries: Maximum number of retry attempts; for Bedrock calls this is the total number of attempts, including the first one (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Usage
1. Initialize the RAG system with desired configuration
2. Ingest documents (text content with optional metadata)
3. Query the system with natural language questions

In [None]:
import os
import json
import uuid
import boto3
import time
import random
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal
from botocore.exceptions import ClientError

from utils.aws.opensearch import (
    OpenSearchConfig,
    VectorSearchConfig,
    OpenSearchManager,
    VectorStore
)
from utils.notebook_utils.importable import notebook_to_module

# Load the sibling notebook 'ingestion.ipynb' through the project's
# notebook_to_module helper so its definitions can be used from this notebook.
# NOTE(review): the path is relative, so this presumably depends on the
# current working directory when the cell runs -- confirm.
ingestion = notebook_to_module('ingestion.ipynb')
# Expose the notebook's ingest_documents under a short module-level name.
ingest_documents = ingestion.ingest_documents

In [None]:
class BaselineRAG:
    """Generic baseline RAG implementation with configurable parameters"""
    
    def __init__(
        self,
        index_name: str = "rag-documents",
        # Document processing config
        chunk_size: int = 500,  # words per chunk
        chunk_overlap: int = 50,  # words shared between neighbouring chunks
        enable_chunking: bool = True,
        # Vector search config
        search_type: Literal['script', 'knn'] = 'script',
        similarity_threshold: Optional[float] = None,
        # OpenSearch config
        index_settings: Optional[Dict] = None,
        knn_params: Optional[Dict] = None,
        # API config
        max_retries: int = 5,
        min_delay: float = 1.0,
        max_delay: float = 60.0
    ):
        """Create the Bedrock client, set up OpenSearch and build the vector store.

        Constructing an instance performs I/O: it creates a boto3
        ``bedrock-runtime`` client and calls ``OpenSearchManager.setup_domain()``.

        Args:
            index_name: Name of the vector index; its first ``-``-separated
                segment is also used to derive the OpenSearch domain name.
            chunk_size: Number of words per chunk (stored on the instance).
            chunk_overlap: Number of overlapping words between chunks
                (stored on the instance).
            enable_chunking: Whether documents should be split into chunks
                (stored on the instance).
            search_type: Vector search flavour, ``'script'`` or ``'knn'``.
            similarity_threshold: Minimum similarity score to include, or
                ``None`` for no threshold.
            index_settings: Custom index settings forwarded to the vector
                search configuration.
            knn_params: k-NN parameters forwarded to the vector search
                configuration.
            max_retries: Maximum number of attempts for retried calls.
            min_delay: Base delay in seconds used for the backoff.
            max_delay: Upper bound in seconds for a single backoff delay.
        """
        # --- AWS clients and session information ---
        self.bedrock = boto3.client('bedrock-runtime')
        self.region = boto3.Session().region_name

        # --- Bedrock model identifiers and index name ---
        self.embedding_model_id = "cohere.embed-english-v3"
        self.llm_model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
        self.index_name = index_name

        # --- Chunking settings ---
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        self.enable_chunking = enable_chunking

        # --- Retry/backoff settings used for Bedrock calls ---
        self.max_retries, self.min_delay, self.max_delay = (
            max_retries, min_delay, max_delay
        )

        # The domain name must stay under the 28 character limit, so it is a
        # short fixed prefix plus at most the first 10 characters of the
        # index name's first segment.
        first_segment = index_name.partition('-')[0]
        domain_settings = OpenSearchConfig(
            domain_name="blrag-" + first_segment[:10],
            cleanup_enabled=True,
            verbose=True
        )

        search_settings = VectorSearchConfig(
            search_type=search_type,
            similarity_threshold=similarity_threshold,
            index_settings=index_settings,
            knn_params=knn_params,
            max_retries=max_retries,
            min_delay=min_delay,
            max_delay=max_delay
        )

        # Bring up the OpenSearch domain. The value returned by setup_domain()
        # is not needed here: the vector store uses the manager's client.
        manager = OpenSearchManager(domain_settings)
        self.opensearch_manager = manager
        manager.setup_domain()

        # --- Vector store bound to the configured index ---
        self.vector_store = VectorStore(
            index_name=index_name,
            config=search_settings,
            client=manager.client
        )
    
    def _invoke_with_retry(self, model_id: str, body: Dict) -> Dict:
        """Invoke Bedrock model with exponential backoff retry
        
        Args:
            model_id: Bedrock model ID
            body: Request body
            
        Returns:
            Model response
            
        Raises:
            Exception: If max retries exceeded
        """
        last_exception = None
        for attempt in range(self.max_retries):
            try:
                response = self.bedrock.invoke_model(
                    modelId=model_id,
                    body=json.dumps(body)
                )
                return json.loads(response['body'].read())
                
            except ClientError as e:
                last_exception = e
                if e.response['Error']['Code'] == 'ThrottlingException':
                    if attempt == self.max_retries - 1:
                        raise
                    # Exponential backoff with jitter
                    delay = min(
                        self.max_delay,
                        self.min_delay * (2 ** attempt) + random.uniform(0, 1)
                    )
                    time.sleep(delay)
                else:
                    raise
                    
        raise last_exception
    
    def get_embeddings(self, text: str) -> Optional[List[float]]:
        """Generate embeddings using Cohere model"""
        request_body = {
            "texts": [text],
            "input_type": "search_document"
        }
        
        try:
            response_body = self._invoke_with_retry(
                model_id=self.embedding_model_id,
                body=request_body
            )
            return response_body['embeddings'][0]
            
        except Exception as e:
            print(f"Error getting embeddings: {str(e)}")
            print(f"Text length: {len(text)} chars, {len(text.split())} words")
            return None
    
    def _store_documents(self, documents: List[Dict[str, Any]], batch_size: int = 100) -> None:
        """Store documents in OpenSearch with embeddings.
        
        Args:
            documents: List of dictionaries with 'content' and optional 'metadata'
            batch_size: Number of documents to process in each batch
        """
        # Process documents to add embeddings
        docs_with_vectors = []
        for doc in documents:
            if 'content' not in doc:
                print("Document missing 'content' field")
                continue
                
            # Generate embedding
            vector = self.get_embeddings(doc['content'])
            if vector is None:
                print(f"Failed to get embeddings for document {doc.get('id', 'unknown')}")
                continue
            
            docs_with_vectors.append({
                'content': doc['content'],
                'vector': vector,
                'metadata': doc.get('metadata', {})
            })
        
        if not docs_with_vectors:
            print("No valid documents to store")
            return
        
        # Store documents in batches
        self.vector_store.store_documents(docs_with_vectors, batch_size=batch_size)
    
    def semantic_search(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        """Search for relevant documents using embeddings
        
        Args:
            query: Search query text
            k: Number of results to return
            
        Returns:
            List of documents with content and metadata
        """
        query_vector = self.get_embeddings(query)
        if query_vector is None:
            print("Failed to get embeddings for query")
            return []
            
        return self.vector_store.search(query_vector, k=k)
    
    def generate_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """Generate response using Claude 3.5 Sonnet"""
        context_str = "\n\n".join([doc['content'] for doc in context])
        
        prompt = f"""You are a helpful AI assistant. Use the following context to answer the question. 
        If you cannot answer the question based on the context, say so.
        
        Context:
        {context_str}
        
        Question: {query}
        
        Answer:"""
        
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1000,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        
        response_body = self._invoke_with_retry(
            model_id=self.llm_model_id,
            body=request_body
        )
        
        return response_body['content'][0]['text']
    
    def query(self, query: str, k: int = 3) -> Dict[str, Any]:
        """Complete RAG pipeline
        
        Args:
            query: Natural language question
            k: Number of context documents to retrieve
            
        Returns:
            Dictionary containing:
            - query: Original question
            - context: Retrieved relevant documents
            - response: Generated answer
        """
        # Get relevant documents
        context = self.semantic_search(query, k)
        
        # Generate response
        response = self.generate_response(query, context)
        
        return {
            "query": query,
            "context": context,
            "response": response
        }
    
    def cleanup(self, delete_resources: bool = False):
        """Clean up resources."""
        try:
            self.vector_store.cleanup(delete_resources=delete_resources)
            if delete_resources:
                self.opensearch_manager.cleanup()
        except:
            pass  # Best effort cleanup