# Baseline RAG Implementation

This notebook implements a generic baseline RAG system that can be used with any source material. It uses:
- Cohere Embed English (cohere.embed-english-v3) for embeddings
- Claude 3.5 Sonnet for LLM responses
- Amazon OpenSearch for vector storage

## Features
- Generic document ingestion
- Vector similarity search
- Context-aware response generation
- Automatic retry with exponential backoff and jitter when Bedrock throttles a request

## Configuration Options

### Document Processing
- chunk_size: Number of words per chunk (default: 500)
- chunk_overlap: Number of overlapping words between chunks (default: 50)
- enable_chunking: Whether to split documents into chunks (default: True)

### Vector Search
- k: Number of context documents to retrieve (default: 3); passed per call to `query()` / `semantic_search()`, not to the constructor
- search_type: Type of vector search to use ('script' or 'knn', default: 'script')
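- similarity_threshold: Minimum score a hit must reach to be included (default: None). Applied only when search_type is 'script', where the score is cosine similarity + 1.0 (so it ranges from 0 to 2)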

### OpenSearch
- index_settings: Custom index settings for performance tuning
- knn_params: HNSW method parameters merged into the embedding field mapping when the index is created (e.g., ef_construction, m)

### API Settings
- max_retries: Maximum number of retry attempts (default: 5)
- min_delay: Minimum delay between retries in seconds (default: 1)
- max_delay: Maximum delay between retries in seconds (default: 60)

## Usage
1. Initialize the RAG system with desired configuration
2. Ingest documents (text content with optional metadata)
3. Query the system with natural language questions

In [None]:
import os
import json
import uuid
import boto3
import time
import random
from pathlib import Path
from typing import List, Dict, Any, Optional, Literal
from opensearchpy import OpenSearch, RequestsHttpConnection, helpers
from requests_aws4auth import AWS4Auth
from tqdm import tqdm
from botocore.exceptions import ClientError

from utils.notebook_utils.importable import notebook_to_module

# Load the ingestion notebook that sits next to this file as a module, then
# expose its ``ingest_documents`` function at module level.
ingestion = notebook_to_module(str(Path(__file__).parent.joinpath('ingestion.ipynb')))
ingest_documents = getattr(ingestion, 'ingest_documents')

In [None]:
class BaselineRAG:
    """Generic baseline RAG implementation with configurable parameters.

    Documents are embedded through Amazon Bedrock (``cohere.embed-english-v3``),
    stored in an OpenSearch index as ``knn_vector`` fields, retrieved by vector
    similarity and handed to Claude 3.5 Sonnet as context for the answer.

    Two retrieval modes are available via ``search_type``:

    * ``'script'`` - exact scoring of every document with a painless script;
      the score is ``cosineSimilarity + 1.0`` and ``similarity_threshold`` (if
      set) is applied to that score as ``min_score``.
    * ``'knn'`` - approximate k-NN query against the HNSW index.
      ``similarity_threshold`` is not applied in this mode.
    """

    # Retrieval modes accepted by ``search_type``.
    _SEARCH_TYPES = ('script', 'knn')

    def __init__(
        self, 
        index_name: str = "rag-documents",
        # Document processing config
        chunk_size: int = 500,  # Default 500 words ≈ 2000 chars
        chunk_overlap: int = 50,  # Default 50 words overlap
        enable_chunking: bool = True,
        # Vector search config
        search_type: Literal['script', 'knn'] = 'script',
        similarity_threshold: Optional[float] = None,
        # OpenSearch config
        index_settings: Optional[Dict] = None,
        knn_params: Optional[Dict] = None,
        # API config
        max_retries: int = 5,
        min_delay: float = 1.0,
        max_delay: float = 60.0
    ):
        """Connect to Bedrock and OpenSearch and make sure the index exists.

        Args:
            index_name: OpenSearch index holding the document chunks.
            chunk_size: Words per chunk, forwarded to the ingestion helper.
            chunk_overlap: Overlapping words between consecutive chunks.
            enable_chunking: Whether ingestion splits documents into chunks.
            search_type: ``'script'`` or ``'knn'`` (see class docstring).
            similarity_threshold: Optional ``min_score`` for script search.
            index_settings: Index settings overriding the defaults; nested and
                dotted keys are both accepted.
            knn_params: Overrides for the HNSW method parameters
                (``ef_construction``, ``m``) of the embedding field.
            max_retries: Maximum Bedrock attempts per request.
            min_delay: Base delay in seconds for exponential backoff.
            max_delay: Upper bound in seconds for a single backoff delay.

        Raises:
            ValueError: If ``search_type`` is unknown, ``OPENSEARCH_HOST`` is
                not set, or no AWS credentials can be resolved.
        """
        # Fail fast on a typo instead of silently falling back to k-NN search.
        if search_type not in self._SEARCH_TYPES:
            raise ValueError(
                f"search_type must be one of {self._SEARCH_TYPES}, got {search_type!r}"
            )

        # Initialize AWS services
        self.bedrock = boto3.client('bedrock-runtime')
        session = boto3.Session()
        self.region = session.region_name
        
        # OpenSearch configuration
        self.opensearch_host = os.getenv('OPENSEARCH_HOST')
        if not self.opensearch_host:
            raise ValueError("OPENSEARCH_HOST environment variable is required")
        
        credentials = session.get_credentials()
        if credentials is None:
            raise ValueError("AWS credentials could not be resolved")

        # Requests to the OpenSearch endpoint are SigV4-signed for service 'es'.
        self.awsauth = AWS4Auth(
            credentials.access_key,
            credentials.secret_key,
            self.region,
            'es',
            session_token=credentials.token
        )
        
        self.opensearch = OpenSearch(
            hosts=[{'host': self.opensearch_host, 'port': 443}],
            http_auth=self.awsauth,
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection
        )
        
        # Model configuration
        self.embedding_model_id = "cohere.embed-english-v3"
        self.llm_model_id = "anthropic.claude-3-5-sonnet-20240620-v1:0"
        self.index_name = index_name
        
        # Document processing configuration
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.enable_chunking = enable_chunking
        
        # Vector search configuration
        self.search_type = search_type
        self.similarity_threshold = similarity_threshold
        
        # OpenSearch configuration
        self.index_settings = index_settings or {}
        self.knn_params = knn_params or {}
        
        # API configuration
        self.max_retries = max_retries
        self.min_delay = min_delay
        self.max_delay = max_delay
        
        # Ensure index exists
        self._create_index_if_not_exists()

    @staticmethod
    def _flatten_index_settings(settings: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
        """Flatten nested settings into dotted keys (``{"a": {"b": 1}}`` -> ``{"a.b": 1}``)."""
        flat: Dict[str, Any] = {}
        for key, value in settings.items():
            dotted = f"{prefix}{key}"
            if isinstance(value, dict):
                flat.update(BaselineRAG._flatten_index_settings(value, f"{dotted}."))
            else:
                flat[dotted] = value
        return flat

    def _build_index_settings(self) -> Dict[str, Any]:
        """Merge the default index settings with ``self.index_settings``.

        Everything is expressed as flat ``index.``-prefixed keys so that a
        nested override such as ``{"knn": {"algo_param": {...}}}`` replaces
        only the leaf it names instead of wiping sibling defaults, and so
        ``index.knn`` (a boolean) can coexist with ``index.knn.algo_param.*``.

        Returns:
            Flat settings dictionary for the create-index request.
        """
        merged: Dict[str, Any] = {
            "index.number_of_shards": 1,
            "index.number_of_replicas": 0,
            # Required for approximate k-NN queries on knn_vector fields.
            "index.knn": True,
            # Higher values = more accurate but slower
            "index.knn.algo_param.ef_search": 512,
        }
        for key, value in self._flatten_index_settings(self.index_settings).items():
            if not key.startswith("index."):
                key = f"index.{key}"
            merged[key] = value
        return merged
    
    def _create_index_if_not_exists(self):
        """Create OpenSearch index with appropriate mapping and settings.

        Does nothing when the index already exists; an existing index keeps
        whatever settings and mapping it was created with.
        """
        # Keyword form works across opensearch-py versions.
        if self.opensearch.indices.exists(index=self.index_name):
            return

        # Default mapping
        mapping = {
            "properties": {
                "content": {"type": "text"},
                "metadata": {"type": "object"},
                "embedding": {
                    "type": "knn_vector",
                    "dimension": 1024,  # Cohere embedding dimension
                    "method": {
                        "name": "hnsw",
                        "space_type": "cosinesimil",
                        "engine": "nmslib",
                        "parameters": {
                            "ef_construction": 512,
                            "m": 16
                        }
                    }
                }
            }
        }
        
        # Update KNN parameters
        if self.knn_params:
            mapping["properties"]["embedding"]["method"]["parameters"].update(self.knn_params)
        
        self.opensearch.indices.create(
            index=self.index_name,
            body={
                "settings": self._build_index_settings(),
                "mappings": mapping
            }
        )
    
    def _invoke_with_retry(self, model_id: str, body: Dict) -> Dict:
        """Invoke Bedrock model with exponential backoff retry.

        Only ``ThrottlingException`` errors are retried; any other client
        error is re-raised immediately. At least one attempt is always made,
        even if ``max_retries`` is zero or negative.
        
        Args:
            model_id: Bedrock model ID
            body: Request body (JSON-serialisable)
            
        Returns:
            Parsed JSON model response
            
        Raises:
            ClientError: On a non-throttling error, or when the final attempt
                is still throttled.
        """
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                response = self.bedrock.invoke_model(
                    modelId=model_id,
                    body=json.dumps(body)
                )
                return json.loads(response['body'].read())
                
            except ClientError as e:
                error_code = e.response.get('Error', {}).get('Code')
                out_of_attempts = attempt == attempts - 1
                if error_code != 'ThrottlingException' or out_of_attempts:
                    raise
                # Exponential backoff with jitter
                delay = min(
                    self.max_delay,
                    self.min_delay * (2 ** attempt) + random.uniform(0, 1)
                )
                time.sleep(delay)

        # Every iteration returns, raises, or (when not last) retries.
        raise RuntimeError("retry loop exited without returning or raising")
    
    def get_embeddings(self, text: str, input_type: str = "search_document") -> List[float]:
        """Generate embeddings using Cohere model.

        Args:
            text: Text to embed.
            input_type: Cohere ``input_type`` sent with the request. The
                default ``"search_document"`` is for text being indexed; use
                ``"search_query"`` for query text.

        Returns:
            The embedding vector for ``text``.

        Raises:
            Exception: Whatever the Bedrock call raised, after printing the
                error and the input size for diagnosis.
        """
        request_body = {
            "texts": [text],
            "input_type": input_type
        }
        
        try:
            response_body = self._invoke_with_retry(
                model_id=self.embedding_model_id,
                body=request_body
            )
            return response_body['embeddings'][0]
            
        except Exception as e:
            print(f"Error getting embeddings: {str(e)}")
            print(f"Text length: {len(text)} chars, {len(text.split())} words")
            raise
    
    def _store_documents(self, documents: List[Dict[str, Any]], batch_size: int = 100) -> None:
        """Store documents in OpenSearch with embeddings.

        Each document is embedded individually and the resulting index actions
        are flushed with the bulk helper every ``batch_size`` documents.
        
        Args:
            documents: List of dictionaries with 'content' and optional 'metadata'
            batch_size: Number of documents to process in each batch

        Raises:
            ValueError: If a document has no 'content' field.
        """
        actions = []
        
        for doc in tqdm(documents, desc="Processing documents"):
            if 'content' not in doc:
                raise ValueError("Each document must have 'content' field")
                
            # Generate embedding
            embedding = self.get_embeddings(doc['content'])
            
            # Prepare document for indexing
            actions.append({
                "_index": self.index_name,
                "_source": {
                    "content": doc['content'],
                    "metadata": doc.get('metadata', {}),
                    "embedding": embedding
                }
            })
            
            # Bulk index when batch is full
            if len(actions) >= batch_size:
                helpers.bulk(self.opensearch, actions)
                actions = []
        
        # Index any remaining documents
        if actions:
            helpers.bulk(self.opensearch, actions)
    
    def ingest_documents(self, documents: List[Dict[str, Any]], batch_size: int = 100) -> None:
        """Ingest documents into vector store with optional chunking.

        Delegates to the module-level ``ingest_documents`` function loaded
        from the ingestion notebook, passing this instance and its chunking
        configuration.
        
        Args:
            documents: List of dictionaries with 'content' and optional 'metadata'
            batch_size: Number of documents to process in each batch
        """
        # Use ingestion notebook's functionality with our configuration
        ingest_documents(
            documents, 
            self, 
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            enable_chunking=self.enable_chunking,
            batch_size=batch_size
        )
    
    def semantic_search(self, query: str, k: int = 3) -> List[Dict[str, Any]]:
        """Search for relevant documents using embeddings.
        
        Args:
            query: Search query text
            k: Number of results to return
            
        Returns:
            List of documents with content and metadata
        """
        # Embed the query with the query-side input type.
        query_embedding = self.get_embeddings(query, input_type="search_query")
        
        if self.search_type == 'script':
            # Script-based cosine similarity search; +1.0 keeps scores non-negative.
            search_query = {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "lang": "painless",
                        "source": "double score = cosineSimilarity(params.query_vector, doc['embedding']); return score + 1.0;",
                        "params": {"query_vector": query_embedding}
                    }
                }
            }
            
            # Add minimum score if threshold is set (compared against cosine + 1.0)
            if self.similarity_threshold is not None:
                search_query["script_score"]["min_score"] = self.similarity_threshold
            
        else:  # search_type == 'knn'
            # Pure k-NN search
            search_query = {
                "knn": {
                    "embedding": {
                        "vector": query_embedding,
                        "k": k
                    }
                }
            }
        
        # "size" is set for both modes: without it the default page size (10)
        # applies and a k-NN request for k > 10 would be silently truncated.
        response = self.opensearch.search(
            index=self.index_name,
            body={
                "size": k,
                "query": search_query,
                "_source": ["content", "metadata"]
            }
        )
        
        return [hit['_source'] for hit in response['hits']['hits']]
    
    def generate_response(self, query: str, context: List[Dict[str, Any]]) -> str:
        """Generate response using Claude 3.5 Sonnet.

        Args:
            query: The user's question.
            context: Retrieved documents; each must have a 'content' field.

        Returns:
            Text of the first content block in the model response.
        """
        context_str = "\n\n".join([doc['content'] for doc in context])
        
        # NOTE: the indentation inside this literal is part of the prompt text.
        prompt = f"""You are a helpful AI assistant. Use the following context to answer the question. 
        If you cannot answer the question based on the context, say so.
        
        Context:
        {context_str}
        
        Question: {query}
        
        Answer:"""
        
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1000,
            "messages": [
                {"role": "user", "content": prompt}
            ]
        }
        
        response_body = self._invoke_with_retry(
            model_id=self.llm_model_id,
            body=request_body
        )
        
        return response_body['content'][0]['text']
    
    def query(self, query: str, k: int = 3) -> Dict[str, Any]:
        """Complete RAG pipeline: retrieve context, then generate an answer.
        
        Args:
            query: Natural language question
            k: Number of context documents to retrieve
            
        Returns:
            Dictionary containing:
            - query: Original question
            - context: Retrieved relevant documents
            - response: Generated answer
        """
        # Get relevant documents
        context = self.semantic_search(query, k)
        
        # Generate response
        response = self.generate_response(query, context)
        
        return {
            "query": query,
            "context": context,
            "response": response
        }

In [None]:
# Example usage with different configurations
def test_rag_system():
    """Build three differently configured RAG systems and exercise each one.

    Every system ingests the same two sample documents and answers the same
    question; the answer and the retrieved context are printed.
    """
    # (banner printed before construction, label used in the report, kwargs)
    setups = (
        # Example 1: Default configuration
        (
            "Testing default configuration...",
            "Default",
            {"index_name": "test-rag-default"},
        ),
        # Example 2: Custom chunking (smaller chunks, more overlap)
        (
            "\nTesting custom chunking...",
            "Custom Chunks",
            {
                "index_name": "test-rag-custom-chunks",
                "chunk_size": 300,
                "chunk_overlap": 100,
                "enable_chunking": True,
            },
        ),
        # Example 3: k-NN search with custom parameters
        (
            "\nTesting k-NN search...",
            "k-NN Search",
            {
                "index_name": "test-rag-knn",
                "search_type": 'knn',
                "knn_params": {
                    "ef_construction": 1024,  # Higher = more accurate index
                    "m": 32,  # Higher = more connections per node
                },
            },
        ),
    )

    # Construct all systems first, announcing each one as it is created.
    systems = []
    for banner, label, options in setups:
        print(banner)
        systems.append((label, BaselineRAG(**options)))

    # Sample documents
    sample_docs = [
        {
            "content": "Machine learning is a subset of artificial intelligence that focuses on developing systems that can learn from data.",
            "metadata": {"source": "test", "topic": "ML"}
        },
        {
            "content": "Deep learning is a type of machine learning that uses neural networks with multiple layers.",
            "metadata": {"source": "test", "topic": "DL"}
        }
    ]

    # Test each configuration
    for label, system in systems:
        print(f"\nTesting {label} Configuration:")
        print("Ingesting documents...")
        system.ingest_documents(sample_docs)

        print("Testing query...")
        answer = system.query("What is machine learning?")

        print("Response:", answer['response'])
        print("\nContext used:")
        for hit in answer['context']:
            print(f"- {hit['content']}")
            print(f"  Metadata: {hit['metadata']}")

# Run the example walkthrough only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    test_rag_system()