# OpenSearch Query Testing

This notebook tests different OpenSearch query approaches to debug and fix the script compilation error in the baseline RAG implementation.

## Error Context
The original error occurs in `semantic_search()` when using a `script_score` query with `cosineSimilarity`:
```
RequestError(400, 'search_phase_execution_exception', 'compile error')
```

## Test Plan
1. Connect to existing OpenSearch domain
2. Test different script query variations
3. Analyze error responses
4. Document working solution

In [None]:
import os
import json
import sys
import boto3
from pathlib import Path
from typing import List, Dict, Any, Optional
from opensearchpy import OpenSearch, RequestsHttpConnection, helpers
from requests_aws4auth import AWS4Auth

# Add project root to path.
# "../.." is resolved against the current working directory, so the root is
# taken to be two levels above wherever the notebook kernel was started.
# This must run before the project import below.
project_root = Path("../..").resolve()
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import utilities (project-local; relies on the sys.path entry added above)
from utils.aws.opensearch_utils import OpenSearchManager

# Configuration from baseline RAG benchmark
OPENSEARCH_DOMAIN = "baseline-rag-benchmark-store"
DATASET_NAME = "OriginOfCovid19Dataset"
# Index name is the lower-cased dataset name plus a "-benchmark" suffix,
# i.e. "originofcovid19dataset-benchmark".
INDEX_NAME = f"{DATASET_NAME.lower()}-benchmark"

In [None]:
# Connect to existing OpenSearch domain
print("Connecting to OpenSearch...")
manager = OpenSearchManager(
    domain_name=OPENSEARCH_DOMAIN,
    cleanup_enabled=False,  # Don't cleanup since it's in use
    verbose=True  # For debugging
)

# The value returned by setup_domain() is used as the client host below and
# is also exported through the environment.
endpoint = manager.setup_domain()
os.environ['OPENSEARCH_HOST'] = endpoint

# Resolve credentials and region from ONE session so both come from the same
# profile/configuration, and fail with a clear message when either is missing
# (get_credentials() and region_name can both be None).
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    raise RuntimeError(
        "No AWS credentials found; configure credentials before running this notebook."
    )
region = session.region_name
if region is None:
    raise RuntimeError(
        "No AWS region configured; set a default region before running this notebook."
    )

# Request-signing auth; 'es' is the service name passed to the signer.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'es',
    session_token=credentials.token
)

# Initialize OpenSearch client (HTTPS on port 443 with certificate verification)
client = OpenSearch(
    hosts=[{'host': endpoint, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

In [None]:
def analyze_error(e: Exception) -> None:
    """Print diagnostic details for an exception raised by an OpenSearch call.

    Always prints the exception type and message. When the exception has an
    ``info`` attribute, that payload is additionally pretty-printed as JSON.

    Args:
        e: Exception from OpenSearch query
    """
    print(f"Error type: {type(e)}")
    print(f"Error message: {str(e)}")
    if hasattr(e, 'info'):
        print("\nDetailed error info:")
        # ``info`` is not guaranteed to be JSON-serializable (it may be a
        # wrapped exception or another arbitrary object). A diagnostic helper
        # must not raise itself, so stringify unknown values and fall back to
        # repr() for payloads json cannot encode at all.
        try:
            details = json.dumps(e.info, indent=2, default=str)
        except (TypeError, ValueError):
            details = repr(e.info)
        print(details)

def test_original_query(query_vector: List[float], k: int = 3) -> Optional[Dict[str, Any]]:
    """Run the original (failing) script_score query to reproduce the error.

    The script text is intentionally left exactly as in the baseline
    implementation; this function exists to reproduce the failure, not to
    fix it.

    Args:
        query_vector: Query embedding vector
        k: Number of results to return

    Returns:
        The search response on success, or None if the query raised (the
        error is printed via analyze_error).
    """
    print("Testing original query...")
    # NOTE(review): the field is passed as the string 'embedding' here; the
    # sibling test below passes doc['embedding'] instead. Presumably this
    # difference is what triggers the compile error -- confirm against the
    # OpenSearch k-NN Painless extension docs.
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    # Broad catch is deliberate: this is a debugging harness that reports
    # whatever the cluster or transport raises instead of aborting the cell.
    try:
        response = client.search(
            index=INDEX_NAME,
            body={
                "size": k,
                "query": script_query,
                "_source": ["content", "metadata"]
            }
        )
    except Exception as e:
        print("Original query failed:")
        analyze_error(e)
        return None

    print("Query successful!")
    return response

def test_alternative_script(query_vector: List[float], k: int = 3) -> Optional[Dict[str, Any]]:
    """Run a script_score query using an alternative Painless script syntax.

    Differs from the original query in two ways: the script language is set
    explicitly to "painless", and the vector field is passed as
    ``doc['embedding']`` rather than as a bare field-name string.

    Args:
        query_vector: Query embedding vector
        k: Number of results to return

    Returns:
        The search response on success, or None if the query raised (the
        error is printed via analyze_error).
    """
    print("\nTesting alternative script syntax...")
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "lang": "painless",  # Explicitly specify language
                # The + 1.0 offset is kept from the original query.
                "source": "double score = cosineSimilarity(params.query_vector, doc['embedding']); return score + 1.0;",
                "params": {"query_vector": query_vector}
            }
        }
    }

    # Broad catch is deliberate: this is a debugging harness that reports
    # whatever the cluster or transport raises instead of aborting the cell.
    try:
        response = client.search(
            index=INDEX_NAME,
            body={
                "size": k,
                "query": script_query,
                "_source": ["content", "metadata"]
            }
        )
    except Exception as e:
        print("Alternative script failed:")
        analyze_error(e)
        return None

    print("Query successful!")
    return response

def test_knn_query(query_vector: List[float], k: int = 3) -> Optional[Dict[str, Any]]:
    """Run a raw k-NN query against the embedding field, without a script.

    Args:
        query_vector: Query embedding vector
        k: Number of results to return

    Returns:
        The search response on success, or None if the query raised (the
        error is printed via analyze_error).
    """
    print("\nTesting k-NN query...")
    knn_query = {
        "knn": {
            "embedding": {
                "vector": query_vector,
                "k": k
            }
        }
    }

    # Broad catch is deliberate: this is a debugging harness that reports
    # whatever the cluster or transport raises instead of aborting the cell.
    try:
        response = client.search(
            index=INDEX_NAME,
            body={
                # Without an explicit "size" the number of returned hits is
                # governed by the search default rather than by k, so the
                # result count would not match the other two tests.
                "size": k,
                "query": knn_query,
                "_source": ["content", "metadata"]
            }
        )
    except Exception as e:
        print("k-NN query failed:")
        analyze_error(e)
        return None

    print("Query successful!")
    return response

In [None]:
# Get sample query vector from existing document
print("Getting sample document for testing...")

# Reset first: if the index is empty or the search fails, later cells see an
# explicit None instead of a NameError or a stale vector from an earlier run.
sample_vector = None
try:
    # Fetch a single arbitrary document and return only its embedding field.
    response = client.search(
        index=INDEX_NAME,
        body={
            "size": 1,
            "query": {"match_all": {}},
            "_source": ["embedding"]
        }
    )

    if response['hits']['hits']:
        sample_vector = response['hits']['hits'][0]['_source']['embedding']
        print("Got sample vector for testing")
    else:
        print("No documents found in index")

except Exception as e:
    # Also reached when the returned document has no 'embedding' field.
    print("Error getting sample document:")
    analyze_error(e)

In [None]:
# Run every query variant against the same sample vector
print("Testing query approaches with sample vector...\n")

# Each helper returns the search response, or None when its query failed.
original_result = test_original_query(sample_vector)   # 1. Original query
alt_result = test_alternative_script(sample_vector)    # 2. Alternative script
knn_result = test_knn_query(sample_vector)             # 3. k-NN query

# Collect the outcomes under display names for the side-by-side summary
results = dict(
    original=original_result,
    alternative=alt_result,
    knn=knn_result,
)

print("\nResults comparison:")
for name, result in results.items():
    # Skip approaches that produced no response
    if not result:
        continue
    print(f"\n{name.title()} query:")
    for hit in result['hits']['hits']:
        print(f"- Score: {hit['_score']}")
        # Only the first 100 characters of the content are shown
        print(f"  Content: {hit['_source'].get('content', '')[:100]}...")