# OpenSearch Filter Testing Notebook

This notebook tests the OpenSearch filter capabilities with real project and document IDs from the bronze-v2 dataset.

## 1. Setup and Imports

In [9]:
from contramate.models import OpenSearchFilter, DocumentFilter
from sample_data_ids import (
    get_sample_project_id,
    get_sample_document_id,
    print_sample_data
)
from contramate.services.opensearch_vector_search_service import OpenSearchVectorSearchServiceFactory
from pathlib import Path
from contramate.models import DocumentSource

## 2. Display Sample Data

Show the real project IDs and document IDs from bronze-v2 that we'll use for testing.

In [10]:
print_sample_data()

Sample Data from bronze-v2

Project 1:
  ID: 00149794-2432-4c18-b491-73d0fafd3efd
  Document 1: 577ff0a3-a032-5e23-bde3-0b6179e97949
  Composite: 00149794-2432-4c18-b491-73d0fafd3efd-577ff0a3-a032-5e23-bde3-0b6179e97949

Project 2:
  ID: 008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b
  Document 1: aa1a0c65-8016-5d11-bbde-22055140660b
  Composite: 008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b-aa1a0c65-8016-5d11-bbde-22055140660b

Project 3:
  ID: 0096b72f-1c0d-4724-924f-011f87d3591a
  Document 1: 16b6078b-248c-5ed9-83ef-20ee0af49396
  Composite: 0096b72f-1c0d-4724-924f-011f87d3591a-16b6078b-248c-5ed9-83ef-20ee0af49396

Project 4:
  ID: 00ab9a0d-4510-4833-bbdb-07abd9e49775
  Document 1: f8f43441-a1be-520b-87b7-14ca6f09b41d
  Composite: 00ab9a0d-4510-4833-bbdb-07abd9e49775-f8f43441-a1be-520b-87b7-14ca6f09b41d

Project 5:
  ID: 00b8501a-19e1-4004-a1ef-76636d796c79


## 3. Initialize OpenSearch Service

In [11]:
ENV_FILE_PATH = Path().absolute().parent / ".envs" / "local.env"
print(f"Using env file at: {ENV_FILE_PATH}")

Using env file at: /Users/datapsycho/PythonProjects/AgentEngBootCamp/contramate/.envs/local.env


In [12]:
# Initialize search service
search_service = OpenSearchVectorSearchServiceFactory.from_env_file(ENV_FILE_PATH)

# Verify connection
try:
    info = search_service.client.info()
    print(f"✓ Connected to OpenSearch cluster: {info['cluster_name']}")
    print(f"  Version: {info['version']['number']}")
except Exception as e:
    print(f"✗ Connection failed: {e}")

# Check index exists
index_name = search_service.index_name
if search_service.client.indices.exists(index=index_name):
    print(f"✓ Index '{index_name}' exists")
    
    # Get document count
    count = search_service.client.count(index=index_name)
    print(f"  Total documents: {count['count']:,}")
else:
    print(f"✗ Index '{index_name}' does not exist")

[32m2025-10-19 20:25:35.346[0m | [1mINFO    [0m | [36mcontramate.integrations.aws.opensearch[0m:[36mcreate_opensearch_client[0m:[36m52[0m - [1mCreated OpenSearch client for localhost:9200[0m
2025-10-19 20:25:35 - opensearch - INFO - GET http://localhost:9200/ [status:200 request:0.005s]
2025-10-19 20:25:35 - opensearch - INFO - HEAD http://localhost:9200/contracts-v1 [status:200 request:0.003s]
2025-10-19 20:25:35 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_count [status:200 request:0.004s]


✓ Connected to OpenSearch cluster: opensearch-cluster
  Version: 2.11.1
✓ Index 'contracts-v1' exists
  Total documents: 1,880


## 4. Test Filter Models

Create various filter configurations using real data from bronze-v2.

In [13]:
# Test 1: Single document filter
print("Test 1: Single Document Filter")
print("=" * 60)

project_id = get_sample_project_id(0)
doc_id = get_sample_document_id(project_id, 0)

single_doc_filter = OpenSearchFilter(
    documents=[
        DocumentFilter(project_id=project_id, reference_doc_id=doc_id)
    ]
)

print(f"Project ID: {project_id}")
print(f"Document ID: {doc_id}")
print(f"Composite ID: {project_id}-{doc_id}")
print(f"\nFilter Clauses:")
for clause in single_doc_filter.to_opensearch_filters():
    print(f"  {clause}")
print()

Test 1: Single Document Filter
Project ID: 00149794-2432-4c18-b491-73d0fafd3efd
Document ID: 577ff0a3-a032-5e23-bde3-0b6179e97949
Composite ID: 00149794-2432-4c18-b491-73d0fafd3efd-577ff0a3-a032-5e23-bde3-0b6179e97949

Filter Clauses:
  {'term': {'project_reference_doc_id': '00149794-2432-4c18-b491-73d0fafd3efd-577ff0a3-a032-5e23-bde3-0b6179e97949'}}



In [14]:
# Test 2: Multiple document filter
print("Test 2: Multiple Document Filter")
print("=" * 60)

multi_doc_filter = OpenSearchFilter(
    documents=[
        DocumentFilter(
            project_id=get_sample_project_id(0),
            reference_doc_id=get_sample_document_id(get_sample_project_id(0), 0)
        ),
        DocumentFilter(
            project_id=get_sample_project_id(1),
            reference_doc_id=get_sample_document_id(get_sample_project_id(1), 0)
        ),
        DocumentFilter(
            project_id=get_sample_project_id(2),
            reference_doc_id=get_sample_document_id(get_sample_project_id(2), 0)
        ),
    ]
)

print(f"Number of documents: 3")
print(f"\nFilter Clauses:")
for clause in multi_doc_filter.to_opensearch_filters():
    print(f"  {clause}")
print()

Test 2: Multiple Document Filter
Number of documents: 3

Filter Clauses:
  {'terms': {'project_reference_doc_id': ['00149794-2432-4c18-b491-73d0fafd3efd-577ff0a3-a032-5e23-bde3-0b6179e97949', '008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b-aa1a0c65-8016-5d11-bbde-22055140660b', '0096b72f-1c0d-4724-924f-011f87d3591a-16b6078b-248c-5ed9-83ef-20ee0af49396']}}



In [15]:
# Test 3: Project ID filter
print("Test 3: Project ID Filter")
print("=" * 60)

project_filter = OpenSearchFilter(
    project_id=[get_sample_project_id(0), get_sample_project_id(1)]
)

print(f"Project IDs: {project_filter.project_id}")
print(f"\nFilter Clauses:")
for clause in project_filter.to_opensearch_filters():
    print(f"  {clause}")
print()

Test 3: Project ID Filter
Project IDs: ['00149794-2432-4c18-b491-73d0fafd3efd', '008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b']

Filter Clauses:
  {'terms': {'project_id': ['00149794-2432-4c18-b491-73d0fafd3efd', '008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b']}}



In [16]:
# Test 4: Document source filter
print("Test 4: Document Source Filter")
print("=" * 60)

source_filter = OpenSearchFilter(
    doc_source=DocumentSource.system
)

print(f"Document Source: {source_filter.doc_source}")
print(f"\nFilter Clauses:")
for clause in source_filter.to_opensearch_filters():
    print(f"  {clause}")
print()

Test 4: Document Source Filter
Document Source: system

Filter Clauses:
  {'term': {'content_source': 'system'}}



In [17]:
# Test 5: Combined filters
print("Test 5: Combined Filters")
print("=" * 60)

combined_filter = OpenSearchFilter(
    project_id=[get_sample_project_id(0)],
    doc_source=DocumentSource.system,
    contract_type=["NDA", "Service Agreement"]
)

print(f"Project IDs: {combined_filter.project_id}")
print(f"Document Source: {combined_filter.doc_source}")
print(f"Contract Types: {combined_filter.contract_type}")
print(f"\nFilter Clauses:")
for clause in combined_filter.to_opensearch_filters():
    print(f"  {clause}")
print()

Test 5: Combined Filters
Project IDs: ['00149794-2432-4c18-b491-73d0fafd3efd']
Document Source: system
Contract Types: ['NDA', 'Service Agreement']

Filter Clauses:
  {'term': {'content_source': 'system'}}
  {'terms': {'contract_type': ['NDA', 'Service Agreement']}}
  {'term': {'project_id': '00149794-2432-4c18-b491-73d0fafd3efd'}}



## 5. Test Search with Filters

Execute actual searches using the filter models with real data.

# Verify Embedding Model Configuration

In [19]:
# Check embedding model configuration
print("Embedding Client Configuration:")
print("=" * 60)
print(f"Model: {search_service.embedding_client.default_embedding_model}")
print(f"Expected dimensions: 1536 (text-embedding-3-small) or 3072 (text-embedding-3-large)")

# Test by generating a sample embedding
test_embedding_response = search_service.embedding_client.create_embeddings("test")
test_dimension = len(test_embedding_response.embeddings[0])
print(f"Actual dimensions: {test_dimension}")

if test_dimension == 1536:
    print("✓ Correct model (text-embedding-3-small)")
elif test_dimension == 3072:
    print("✗ Wrong model! Using text-embedding-3-large instead of text-embedding-3-small")
    print("  Please restart the notebook kernel and re-run all cells")
else:
    print(f"⚠ Unexpected dimension: {test_dimension}")

Embedding Client Configuration:
Model: text-embedding-3-small
Expected dimensions: 1536 (text-embedding-3-small) or 3072 (text-embedding-3-large)


2025-10-19 20:38:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Actual dimensions: 1536
✓ Correct model (text-embedding-3-small)


In [22]:
# Search 3: Hybrid search with combined filters
print("Search 3: Hybrid Search with Combined Filters")
print("=" * 60)

filter_dict = {
    "project_id": [get_sample_project_id(0)],
    "doc_source": "system",
    # "contract_type": ["NDA"]
}

try:
    result = search_service.hybrid_search(
        query="termination clause",
        filters=filter_dict,
        size=5
    )
    
    if result.is_ok():
        response = result.unwrap()
        print(f"Query: 'termination clause'")
        print(f"Filter: Project + Source + Contract Type")
        print(f"Results: {len(response.results)} documents found\n")
        
        for i, search_result in enumerate(response.results, 1):
            print(f"{i}. Score: {search_result.score:.4f}")
            print(f"   Project: {search_result.project_id[:8]}...")
            print(f"   Source: {search_result.content_source}")
            print(f"   Content: {search_result.content[:100]}...\n")
    else:
        print(f"Search failed: {result.err()}")
        
except Exception as e:
    print(f"Search failed: {e}")

print()

[32m2025-10-19 20:40:53.131[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36mhybrid_search[0m:[36m377[0m - [1m🔍 Performing hybrid search for: 'termination clause...'[0m


Search 3: Hybrid Search with Combined Filters


2025-10-19 20:40:53 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:40:53.443[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36mhybrid_search[0m:[36m383[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:40:53 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.106s]
[32m2025-10-19 20:40:53.553[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36mhybrid_search[0m:[36m475[0m - [1m✅ Hybrid search returned 5 results[0m


Query: 'termination clause'
Filter: Project + Source + Contract Type
Results: 5 documents found

1. Score: 5.5104
   Project: 00149794...
   Source: system
   Content: Context: Document Preamble

24.  Security and Disaster Recovery

Context: Document Preamble > 24. Se...

2. Score: 4.9936
   Project: 00149794...
   Source: system
   Content: Context: Document Preamble > 13. Permitted Users, Pricing and Subscription Information

13.1. The Pu...

3. Score: 4.8747
   Project: 00149794...
   Source: system
   Content: EXHIBIT 10.1

ELECTRONIC JOURNAL SOFT WARE DEVELOPMENT,

HOSTING AND MANAGEMENT AGREEMENT

This AGRE...

4. Score: 4.0762
   Project: 00149794...
   Source: system
   Content: Context: Document Preamble > 33. Rights Upon Termination > 33.1. Commencing upon any notice of termi...

5. Score: 1.4221
   Project: 00149794...
   Source: system
   Content: Context: Document Preamble

2.  The Modify Profile page will allow them to go to a change password

...




## 6. Performance Comparison

Compare search performance with and without filters.

In [23]:
import time

query = "intellectual property rights"
iterations = 3

print("Performance Comparison")
print("=" * 60)
print(f"Query: '{query}'")
print(f"Iterations: {iterations}\n")

# Test 1: No filters
print("Test 1: No Filters")
times = []
for i in range(iterations):
    start = time.time()
    result = search_service.semantic_search(query=query, k=10)
    elapsed = time.time() - start
    times.append(elapsed)
    
    if result.is_ok():
        response = result.unwrap()
        print(f"  Run {i+1}: {elapsed:.3f}s ({len(response.results)} results)")
    else:
        print(f"  Run {i+1}: Failed - {result.err()}")

avg_no_filter = sum(times) / len(times)
print(f"  Average: {avg_no_filter:.3f}s\n")

# Test 2: With project filter
print("Test 2: With Project Filter")
filter_dict = {"project_id": [get_sample_project_id(0)]}
times = []
for i in range(iterations):
    start = time.time()
    result = search_service.semantic_search(query=query, filters=filter_dict, k=10)
    elapsed = time.time() - start
    times.append(elapsed)
    
    if result.is_ok():
        response = result.unwrap()
        print(f"  Run {i+1}: {elapsed:.3f}s ({len(response.results)} results)")
    else:
        print(f"  Run {i+1}: Failed - {result.err()}")

avg_with_filter = sum(times) / len(times)
print(f"  Average: {avg_with_filter:.3f}s\n")

# Test 3: With multiple filters
print("Test 3: With Multiple Filters")
filter_dict = {
    "project_id": [get_sample_project_id(0), get_sample_project_id(1)],
    "doc_source": "system"
}
times = []
for i in range(iterations):
    start = time.time()
    result = search_service.semantic_search(query=query, filters=filter_dict, k=10)
    elapsed = time.time() - start
    times.append(elapsed)
    
    if result.is_ok():
        response = result.unwrap()
        print(f"  Run {i+1}: {elapsed:.3f}s ({len(response.results)} results)")
    else:
        print(f"  Run {i+1}: Failed - {result.err()}")

avg_multi_filter = sum(times) / len(times)
print(f"  Average: {avg_multi_filter:.3f}s\n")

print("Summary:")
print(f"  No filter: {avg_no_filter:.3f}s")
print(f"  Single filter: {avg_with_filter:.3f}s ({((avg_with_filter/avg_no_filter - 1) * 100):+.1f}%)")
print(f"  Multiple filters: {avg_multi_filter:.3f}s ({((avg_multi_filter/avg_no_filter - 1) * 100):+.1f}%)")

[32m2025-10-19 20:41:05.239[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


Performance Comparison
Query: 'intellectual property rights'
Iterations: 3

Test 1: No Filters


2025-10-19 20:41:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:06.785[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:06 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.040s]
[32m2025-10-19 20:41:06.829[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 10 results above threshold 0.5[0m
[32m2025-10-19 20:41:06.830[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 1: 1.590s (10 results)


2025-10-19 20:41:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:07.254[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:07 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.038s]
[32m2025-10-19 20:41:07.298[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 10 results above threshold 0.5[0m
[32m2025-10-19 20:41:07.299[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 2: 0.469s (10 results)


2025-10-19 20:41:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:07.694[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:07 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.032s]
[32m2025-10-19 20:41:07.729[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 10 results above threshold 0.5[0m
[32m2025-10-19 20:41:07.730[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 3: 0.431s (10 results)
  Average: 0.830s

Test 2: With Project Filter


2025-10-19 20:41:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:08.003[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:08 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.024s]
[32m2025-10-19 20:41:08.043[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m
[32m2025-10-19 20:41:08.043[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 1: 0.313s (0 results)


2025-10-19 20:41:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:08.277[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:08 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.011s]
[32m2025-10-19 20:41:08.290[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m
[32m2025-10-19 20:41:08.290[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m
2025-10-19 20:41:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 2

  Run 2: 0.247s (0 results)


[32m2025-10-19 20:41:08.492[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:08 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.013s]
[32m2025-10-19 20:41:08.507[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m
[32m2025-10-19 20:41:08.508[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 3: 0.217s (0 results)
  Average: 0.259s

Test 3: With Multiple Filters


2025-10-19 20:41:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:08.711[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:08 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.017s]
[32m2025-10-19 20:41:08.731[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m
[32m2025-10-19 20:41:08.732[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 1: 0.224s (0 results)


2025-10-19 20:41:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:08.943[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:08 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.012s]
[32m2025-10-19 20:41:08.957[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m
[32m2025-10-19 20:41:08.958[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m175[0m - [1m🔍 Generating embedding for query: 'intellectual property rights...'[0m


  Run 2: 0.226s (0 results)


2025-10-19 20:41:09 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[32m2025-10-19 20:41:09.229[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m181[0m - [1m✅ Generated embedding vector with 1536 dimensions[0m
2025-10-19 20:41:09 - opensearch - INFO - POST http://localhost:9200/contracts-v1/_search [status:200 request:0.010s]
[32m2025-10-19 20:41:09.241[0m | [1mINFO    [0m | [36mcontramate.services.opensearch_vector_search_service[0m:[36msemantic_search[0m:[36m230[0m - [1m✅ Semantic search returned 0 results above threshold 0.5[0m


  Run 3: 0.284s (0 results)
  Average: 0.245s

Summary:
  No filter: 0.830s
  Single filter: 0.259s (-68.8%)
  Multiple filters: 0.245s (-70.5%)


## 7. Filter Validation Tests

Test edge cases and error handling.

In [24]:
print("Filter Validation Tests")
print("=" * 60)

# Test 1: Empty filter
print("Test 1: Empty Filter")
empty_filter = OpenSearchFilter()
print(f"  Has filters: {empty_filter.has_filters()}")
print(f"  Filter clauses: {empty_filter.to_opensearch_filters()}")
print()

# Test 2: Invalid document source
print("Test 2: Invalid Document Source (should fail)")
try:
    invalid_filter = OpenSearchFilter(doc_source="invalid_source")
    print("  ✗ Validation failed to catch invalid source")
except Exception as e:
    print(f"  ✓ Validation caught error: {type(e).__name__}")
print()

# Test 3: Composite ID format
print("Test 3: Composite ID Format")
project_id = get_sample_project_id(0)
doc_id = get_sample_document_id(project_id, 0)
doc_filter = DocumentFilter(project_id=project_id, reference_doc_id=doc_id)
print(f"  Project ID: {doc_filter.project_id}")
print(f"  Document ID: {doc_filter.reference_doc_id}")
print(f"  Composite ID: {doc_filter.project_reference_doc_id}")
print(f"  Format matches: {doc_filter.project_reference_doc_id == f'{project_id}-{doc_id}'}")
print()

# Test 4: Filter dict conversion
print("Test 4: Dict to Filter Conversion")
filter_dict = {
    "documents": [
        {"project_id": get_sample_project_id(0), "reference_doc_id": get_sample_document_id(get_sample_project_id(0), 0)}
    ],
    "project_id": [get_sample_project_id(1)],
    "doc_source": "system"
}
filter_obj = OpenSearchFilter(**filter_dict)
print(f"  Documents: {len(filter_obj.documents)} document(s)")
print(f"  Project IDs: {filter_obj.project_id}")
print(f"  Doc Source: {filter_obj.doc_source}")
print(f"  Has filters: {filter_obj.has_filters()}")
print()

print("✓ All validation tests completed")

Filter Validation Tests
Test 1: Empty Filter
  Has filters: False
  Filter clauses: [{'term': {'content_source': 'system'}}]

Test 2: Invalid Document Source (should fail)
  ✓ Validation caught error: ValidationError

Test 3: Composite ID Format
  Project ID: 00149794-2432-4c18-b491-73d0fafd3efd
  Document ID: 577ff0a3-a032-5e23-bde3-0b6179e97949
  Composite ID: 00149794-2432-4c18-b491-73d0fafd3efd-577ff0a3-a032-5e23-bde3-0b6179e97949
  Format matches: True

Test 4: Dict to Filter Conversion
  Documents: 1 document(s)
  Project IDs: ['008a9fd2-9a4a-4c3f-ad5c-d33eca94af3b']
  Doc Source: system
  Has filters: True

✓ All validation tests completed
