# StackAI Vector Database Exploration

Interactive notebook for testing the StackAI vector database API.

**Setup:**
1. Start the server: `make start`
2. Seed test data: `python scripts/seed_data.py --library all`
3. Run the cells below in order (the first code cell defines the helper functions used by the later cells)

In [None]:
import httpx

# Root of the versioned REST API; relative paths passed to `client` are resolved against it.
BASE_URL = "http://localhost:8000/api/v1"
# Shared HTTP client reused by the helpers and cells below (30-second timeout).
client = httpx.Client(base_url=BASE_URL, timeout=30.0)

# Helper functions

def search(query: str, library: str = "recipes_lib", k: int = 3):
    """Run a search against one library and print the hits.

    Posts ``{"query": query, "k": k}`` to ``/libraries/{library}/search``
    through the shared ``client``. On any status other than 200 the status
    code and response body are printed and the function returns early.
    Otherwise a short header is printed, followed by one numbered entry per
    result showing its score, document id and chunk text.

    Args:
        query: Search text sent to the API.
        library: ID of the library to search.
        k: Value sent as ``k`` in the request body.

    Returns:
        None. All output goes to stdout.
    """
    payload = {"query": query, "k": k}
    resp = client.post(f"/libraries/{library}/search", json=payload)
    if resp.status_code != 200:
        print(f"Error: {resp.status_code} - {resp.text}")
        return

    body = resp.json()

    # Header: echo the query as the server reported it, then the hit count.
    echoed_query = body["query"]
    print(f'Query: "{echoed_query}"')
    print(f"Library: {library}")
    hit_count = body["result_count"]
    print(f"Results: {hit_count}")
    print("-" * 60)

    # One numbered entry per hit, starting at 1.
    for rank, hit in enumerate(body["results"], start=1):
        matched = hit["chunk"]
        score_text = format(hit["score"], ".4f")
        print(f"\n[{rank}] Score: {score_text}")
        print(f"    Doc: {matched['document_id']}")
        print(f"    Text: {matched['text']}")


def list_libraries():
    """List all libraries.

    Issues ``GET /libraries`` through the shared ``client`` and prints the
    id and name of every library in the response. On a non-200 response the
    status code and body are printed and nothing else is done.

    Returns:
        None. All output goes to stdout.
    """
    response = client.get("/libraries")
    # Guard against error responses: without this check an error payload
    # would be iterated as if it were the list of libraries.
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return
    libraries = response.json()
    print(f"Libraries ({len(libraries)}):")
    for lib in libraries:
        print(f"  - {lib['id']}: {lib['name']}")


def list_documents(library: str):
    """Print the id and name of every document in a library.

    Issues ``GET /libraries/{library}/documents`` through the shared
    ``client``. A 200 response is printed as a count line followed by one
    line per document; any other status prints the response body as an
    error.

    Args:
        library: ID of the library whose documents are listed.

    Returns:
        None. All output goes to stdout.
    """
    resp = client.get(f"/libraries/{library}/documents")
    if resp.status_code == 200:
        documents = resp.json()
        print(f"Documents in {library} ({len(documents)}):")
        for entry in documents:
            print(f"  - {entry['id']}: {entry['name']}")
    else:
        print(f"Error: {resp.text}")


def list_chunks(document: str, show_text: bool = True):
    """Print the chunks that belong to a document.

    Issues ``GET /documents/{document}/chunks`` through the shared
    ``client``. Any status other than 200 prints the response body as an
    error and returns early.

    Args:
        document: ID of the document whose chunks are listed.
        show_text: When true, each line shows the chunk id and its text,
            cut to the first 80 characters plus ``...`` if the text is
            longer than 80. When false, only the chunk id is shown.

    Returns:
        None. All output goes to stdout.
    """
    resp = client.get(f"/documents/{document}/chunks")
    if resp.status_code != 200:
        print(f"Error: {resp.text}")
        return

    items = resp.json()
    print(f"Chunks in {document} ({len(items)}):")
    for item in items:
        if not show_text:
            print(f"  - {item['id']}")
            continue

        body = item["text"]
        # Keep each listing line short: long texts are cut at 80 characters.
        if len(body) > 80:
            shown = f"{body[:80]}..."
        else:
            shown = body
        print(f"  [{item['id']}] {shown}")


def health_check():
    """Check if the server is running.

    Sends ``GET http://localhost:8000/health`` and prints a one-line
    verdict: success on status 200, the status code otherwise, or a hint
    when the request itself fails. Request-level failures
    (``httpx.RequestError`` and its subclasses) are reported rather than
    raised.

    Returns:
        None. All output goes to stdout.
    """
    try:
        # Only the request can raise, so it is the only statement guarded.
        response = httpx.get("http://localhost:8000/health")
    except httpx.ConnectError:
        print("Cannot connect to server. Start it with: make start")
    except httpx.RequestError as exc:
        # Timeouts and other transport failures are not ConnectError;
        # report them instead of surfacing a traceback in the notebook.
        print(f"Health check failed: {type(exc).__name__}: {exc}")
    else:
        if response.status_code == 200:
            print("Server is running")
        else:
            print(f"Server returned: {response.status_code}")


# Confirms this cell ran; the cells below call the helpers defined above.
print("Helper functions loaded: search(), list_libraries(), list_documents(), list_chunks(), health_check()")

## Quick Start

In [None]:
# Confirm the server is reachable before running the remaining cells.
health_check()

In [None]:
# Print the id and name of every library returned by the server.
list_libraries()

## Search

Available libraries (after seeding):
- `recipes_lib` - Cooking recipes
- `support_lib` - Support knowledge base
- `products_lib` - Product manuals

In [None]:
# Natural-language question against the recipes library (helper default k=3).
search("How do I make a creamy pasta sauce?", library="recipes_lib")

In [None]:
# Same helper, pointed at the support knowledge base library.
search("How do I reset my password?", library="support_lib")

In [None]:
# Short keyword-style query; k=5 overrides the helper's default of 3.
search("bluetooth pairing", library="products_lib", k=5)

In [None]:
# Try your own query: edit the query text, library, or k below and re-run this cell.
search("chicken curry recipe", library="recipes_lib", k=5)

## Browse Data

In [None]:
# Print the id and name of each document in the recipes library.
list_documents("recipes_lib")

In [None]:
# Show the chunks of one document (text truncated to 80 characters per line).
# NOTE(review): assumes this document ID exists in the seeded data — confirm against the list_documents() output.
list_chunks("recipes_spaghetti_carbonara")

## Manual API Calls

For more control, use the client directly:

In [None]:
# Raw API call example: hits the same search endpoint that search() wraps,
# but keeps the unformatted response instead of printing a summary.
response = client.post("/libraries/recipes_lib/search", json={
    "query": "baking cookies",
    "k": 2
})
# Last expression in the cell, so the notebook displays the parsed JSON body.
response.json()

## Testing Deletes

Test that deleting chunks/documents/libraries properly removes them from both storage and the search index.

In [None]:
# Create a test library for delete testing.
# All IDs in this cell are fixed strings chosen by the notebook, not generated by the server.
test_lib = {"id": "delete_test_lib", "name": "Delete Test Library"}
response = client.post("/libraries", json=test_lib)
print(f"Create library: {response.status_code}")

# Create a document inside that library
test_doc = {"id": "delete_test_doc", "library_id": "delete_test_lib", "name": "Test Document"}
response = client.post("/libraries/delete_test_lib/documents", json=test_doc)
print(f"Create document: {response.status_code}")

# Create chunks: three unrelated topics so each search below has one obvious match
chunks = [
    {"id": "chunk_a", "document_id": "delete_test_doc", "text": "The quick brown fox jumps over the lazy dog"},
    {"id": "chunk_b", "document_id": "delete_test_doc", "text": "Machine learning is a subset of artificial intelligence"},
    {"id": "chunk_c", "document_id": "delete_test_doc", "text": "Python is a popular programming language"},
]
# All three chunks are sent in a single request to the batch endpoint.
response = client.post("/documents/delete_test_doc/chunks/batch", json={"chunks": chunks})
# Falls back to 0 when the response body has no 'created_count' field.
print(f"Create chunks: {response.status_code} - Created {response.json().get('created_count', 0)} chunks")

In [None]:
# Verify search works - search for "programming"
# Baseline before any delete: chunk_c ("Python is a popular programming language")
# is the chunk whose text matches this query.
print("=== BEFORE DELETE ===")
search("programming language", library="delete_test_lib", k=3)

In [None]:
# Delete single chunk (chunk_c - the Python one)
response = client.delete("/chunks/chunk_c")
print(f"Delete chunk_c: {response.status_code}")

# Verify it's gone from search results by repeating the baseline query
print("\n=== AFTER DELETING chunk_c ===")
search("programming language", library="delete_test_lib", k=3)
# Should NOT find the Python chunk anymore
# NOTE(review): chunk_a / chunk_b presumably may still be listed as weaker matches — confirm against the API's behavior.

In [None]:
# Create another document with chunks to test document deletion
test_doc2 = {"id": "delete_test_doc2", "library_id": "delete_test_lib", "name": "Second Test Document"}
response = client.post("/libraries/delete_test_lib/documents", json=test_doc2)
print(f"Create document2: {response.status_code}")

# Two pet-themed chunks, distinct from the topics in the first document
chunks2 = [
    {"id": "chunk_d", "document_id": "delete_test_doc2", "text": "Cats are popular pets around the world"},
    {"id": "chunk_e", "document_id": "delete_test_doc2", "text": "Dogs are known as man's best friend"},
]
response = client.post("/documents/delete_test_doc2/chunks/batch", json={"chunks": chunks2})
print(f"Create chunks: {response.status_code}")

# Verify we can find pet content (baseline for the document-delete cell)
print("\n=== BEFORE DOCUMENT DELETE ===")
search("pets and animals", library="delete_test_lib", k=5)

In [None]:
# Delete the document (should cascade delete its chunks from index too)
response = client.delete("/libraries/delete_test_lib/documents/delete_test_doc2")
print(f"Delete document2: {response.status_code}")

# Verify pet chunks are gone from search by repeating the baseline query
print("\n=== AFTER DOCUMENT DELETE ===")
search("pets and animals", library="delete_test_lib", k=5)
# Should NOT find cats or dogs chunks (chunk_d / chunk_e) anymore

In [None]:
# Test library deletion (deletes everything including index file)
response = client.delete("/libraries/delete_test_lib")
print(f"Delete library: {response.status_code}")

# Verify library is gone: fetching it by ID should now fail
response = client.get("/libraries/delete_test_lib")
print(f"Get deleted library: {response.status_code} (should be 404)")

# Verify search fails gracefully
# NOTE(review): the expected status is not stated here — presumably 404 like the GET above; confirm.
response = client.post("/libraries/delete_test_lib/search", json={"query": "test", "k": 3})
print(f"Search deleted library: {response.status_code}")

# Printed unconditionally; check the status codes above to judge the outcome.
print("\n✓ Delete tests complete!")

## Testing Index Persistence

Test that indexes survive server restarts.

**Manual test:**
1. Run the cell below to create test data
2. Restart the server (Ctrl+C, then `make start`)
3. Run the verification cell to confirm search still works
4. Run the cleanup cell to delete the test library

In [None]:
# Step 1: Create test data for persistence test
# Library, document and chunk IDs are fixed strings so the verification and
# cleanup cells can refer to them after the server restart.
persist_lib = {"id": "persist_test_lib", "name": "Persistence Test Library"}
response = client.post("/libraries", json=persist_lib)
print(f"Create library: {response.status_code}")

persist_doc = {"id": "persist_test_doc", "library_id": "persist_test_lib", "name": "Persistence Test Doc"}
response = client.post("/libraries/persist_test_lib/documents", json=persist_doc)
print(f"Create document: {response.status_code}")

# Chunk texts are worded to match the query used in the verification cell.
persist_chunks = [
    {"id": "persist_chunk_1", "document_id": "persist_test_doc", "text": "This chunk should survive a server restart"},
    {"id": "persist_chunk_2", "document_id": "persist_test_doc", "text": "Index persistence means the search index is saved to disk"},
]
response = client.post("/documents/persist_test_doc/chunks/batch", json={"chunks": persist_chunks})
print(f"Create chunks: {response.status_code}")

# Printed unconditionally; check the three status codes above before restarting.
print("\n✓ Test data created. Now restart the server and run the next cell.")

In [None]:
# Step 2: Verify search still works after restart
# (Run this AFTER restarting the server)

print("Checking if search works after restart...")
# search() prints an "Error: ..." line instead of results when the request does not return 200.
search("server restart persistence", library="persist_test_lib", k=3)

# If results appear, persistence is working!
# This message is printed regardless of the search outcome; inspect the output above.
print("\n✓ If you see results above, index persistence is working!")

In [None]:
# Cleanup: Delete persistence test library
response = client.delete("/libraries/persist_test_lib")
print(f"Cleanup - Delete library: {response.status_code}")
# Report success only for a 2xx response, so a failed delete is not
# mistaken for a clean state.
if 200 <= response.status_code < 300:
    print("✓ Test data cleaned up")
else:
    print(f"✗ Cleanup failed: {response.text}")