In [1]:
# Cell 1: Setup, Path, and Imports
import sys
import os
import logging
import shutil # For cleaning up the test database directory
import time

# Route INFO-and-above records through a format tagged RETRIEVER_TEST so that
# log lines produced while running this notebook are easy to tell apart.
logging.basicConfig(
    format='%(asctime)s - RETRIEVER_TEST - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make the parent of the current working directory importable. The cells below
# import `src.indexer`, `src.data_loader` and `src.retriever`, so it is the
# directory that CONTAINS `src` (not `src` itself) that must be on sys.path.
module_path = os.path.abspath(os.pardir) # Assumes notebook is in 'notebooks/' dir
if module_path in sys.path:
    print(f"{module_path} already in sys.path")
else:
    print(f"Adding {module_path} to sys.path")
    sys.path.append(module_path)

print("Basic setup complete.")

Adding f:\interview\acordao\acordao_validator to sys.path
Basic setup complete.


In [2]:
# Cell 2: Imports, Constants, and Cleanup Helper

# Standard library imports already done in Cell 1

# --- Project Modules ---
# Each import group is attempted independently: a failure is reported with a
# print and the affected names are bound to None (or to fallback constants),
# so later cells can test for None instead of hitting a NameError.
# NOTE(review): when the indexer import succeeds, CHROMA_PERSIST_DIR is the
# value defined by src.indexer, not the separate test directory used in the
# fallback branch. cleanup_chroma_test_db() deletes that directory, so confirm
# that wiping the indexer's directory is intended.
try:
    # Need indexer to setup the DB for testing the retriever
    from src.indexer import get_embedding_model, create_or_update_index
    from src.indexer import CHROMA_PERSIST_DIR, CHROMA_COLLECTION_NAME, EMBEDDING_MODEL_NAME
    print("Successfully imported from indexer.")
except ImportError as e:
    print(f"ERROR importing from indexer: {e}. Indexer tests might be needed first.")
    # Define fallbacks to avoid NameErrors, though tests will likely fail
    get_embedding_model = None
    create_or_update_index = None
    CHROMA_PERSIST_DIR = os.path.join(".", "chroma_db_retriever_test_index") # Use separate test dir
    CHROMA_COLLECTION_NAME = "retriever_test_acordaos"
    EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large-instruct" # Keep consistent

try:
    # Need data_loader to get data for indexing
    from src.data_loader import load_and_prepare_data
    print("Successfully imported from data_loader.")
except ImportError as e:
    print(f"ERROR importing from data_loader: {e}")
    load_and_prepare_data = None

try:
    # The module we are testing
    from src.retriever import retrieve_relevant_chunks, _initialize_retriever
    print("Successfully imported from retriever.")
except ImportError as e:
    print(f"ERROR importing from retriever: {e}")
    retrieve_relevant_chunks = None
    _initialize_retriever = None # If needed for specific init tests

# --- Third-party ---
# chromadb is used directly in Cell 3 to open a verification client.
try:
    import chromadb
    print("Successfully imported chromadb.")
except ImportError as e:
    print(f"ERROR importing chromadb: {e}")
    chromadb = None


# --- Helper Function for Cleanup ---
# (Same as in indexer_tests)
def cleanup_chroma_test_db(persist_dir=CHROMA_PERSIST_DIR):
    """Removes the ChromaDB test directory if it exists.

    Before deleting, makes a best-effort attempt to release any ChromaDB client
    cached on the already-imported ``src.retriever`` module (attributes
    ``_chroma_client_instance`` / ``_chroma_collection_instance``), since a
    live client may keep files in the directory open.

    The client reset and the directory removal are handled independently: a
    failing ``reset()`` must not prevent the directory from being deleted, and
    the cached references are dropped whether or not the reset succeeds, so the
    retriever does not keep pointing at a database that is about to disappear.

    Args:
        persist_dir: Path (relative or absolute) of the ChromaDB directory to
            remove. Defaults to ``CHROMA_PERSIST_DIR`` as bound when this
            function was defined.

    Returns:
        None. Progress and failures are reported with ``print``; no exception
        is propagated to the caller.
    """
    abs_persist_dir = os.path.abspath(persist_dir) # Use absolute path
    if not os.path.exists(abs_persist_dir):
        print(f"Test ChromaDB directory not found (no cleanup needed): {abs_persist_dir}")
        return

    print(f"Cleaning up existing test ChromaDB directory: {abs_persist_dir}")

    # Attempt to clear the retriever's cached client (might help with file locks).
    # Only done when the module is already imported; it is never imported here.
    retriever_module = sys.modules.get('src.retriever')
    if retriever_module is not None:
        cached_client = getattr(retriever_module, '_chroma_client_instance', None)
        if cached_client is not None:
            print("Attempting to reset cached ChromaDB client...")
            try:
                # reset() can raise (e.g. when the client configuration does not
                # permit resets), so it gets its own handler.
                cached_client.reset()
                print("Cached client reset.")
            except Exception as reset_error:
                print(f"Could not reset cached client: {reset_error}. Dropping cached references instead.")
            finally:
                # Drop the references in every case so the next retrieval
                # re-initialises instead of reusing a stale handle.
                retriever_module._chroma_client_instance = None
                retriever_module._chroma_collection_instance = None
        else:
            print("No cached client found in retriever module to reset.")

    try:
        shutil.rmtree(abs_persist_dir)
        print("Cleanup successful.")
    except Exception as e:
        # Deliberately best-effort: report and continue (e.g. files still locked).
        print(f"Error during cleanup: {e}. Manual deletion might be required.")

# Show the absolute location that will be treated as the test database for
# this run, then wipe whatever a previous run left there.
test_db_dir_abs = os.path.abspath(CHROMA_PERSIST_DIR)
print(f"\nSetup complete. Using test ChromaDB directory: {test_db_dir_abs}")
cleanup_chroma_test_db(CHROMA_PERSIST_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Successfully imported from indexer.
Successfully imported from data_loader.
Successfully imported from retriever.
Successfully imported chromadb.

Setup complete. Using test ChromaDB directory: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
Cleaning up existing test ChromaDB directory: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
No cached client found in retriever module to reset.
Cleanup successful.


In [3]:
# Cell 3: Pre-Test Indexing Setup
# We need to create an index first to test retrieval against it.

print("\n" + "="*10 + " Running Pre-Test Indexing Setup " + "="*10)

# Paths are resolved relative to the current working directory, which is
# assumed to be the notebook's own directory (one level below the project root).
base_project_dir = os.path.abspath(os.path.join('..'))
data_dir = os.path.join(base_project_dir, "data")
persist_dir = os.path.abspath(CHROMA_PERSIST_DIR) # Use absolute path

# --- Files to use ---
# Using the same files as indexer_tests for consistency
acordao_file = "Acórdão 733 de 2025 Plenário.pdf"
resumo_file = "Acórdão 733-2025 resumos.txt" # Not used for indexing, but part of data loading
acordao_path = os.path.join(data_dir, acordao_file)
resumo_path = os.path.join(data_dir, resumo_file)

# --- Global variable for the model instance ---
# To pass it to the retriever tests if needed, and avoid reloading
embedding_model_instance_for_retriever_test = None
# Flipped to True only after the index has been built AND verified; Cell 4
# reads this flag to decide whether to run.
setup_successful = False

# --- Check dependencies ---
# chromadb is part of the check because the verification step below calls
# chromadb.PersistentClient; Cell 2 binds it to None when the import fails.
required_dependencies = (load_and_prepare_data, get_embedding_model, create_or_update_index, chromadb)
if any(dependency is None for dependency in required_dependencies):
    print("-> SETUP SKIPPED: Missing necessary functions due to import errors in Cell 2.")
elif not os.path.exists(acordao_path):
    print(f"-> SETUP SKIPPED: Acordao file not found at {acordao_path}")
else:
    try:
        # --- Ensure clean start ---
        print("Attempting pre-setup cleanup...")
        cleanup_chroma_test_db(CHROMA_PERSIST_DIR) # Try cleanup before indexing

        # 1. Load Data
        print("\nLoading data using data_loader...")
        acordao_chunks, _ = load_and_prepare_data(acordao_path, resumo_path) # Only need chunks
        if not acordao_chunks:  # covers both None and an empty collection
            raise ValueError("Data loading failed or returned no chunks.")
        print(f"Loaded {len(acordao_chunks)} acórdão chunks.")

        # 2. Load Model
        print("\nLoading embedding model...")
        # Use the function from indexer.py
        embedding_model_instance_for_retriever_test = get_embedding_model()
        if embedding_model_instance_for_retriever_test is None:
            raise ValueError("Failed to load embedding model.")
        print(f"Model loaded: {embedding_model_instance_for_retriever_test}")

        # 3. Run Indexer
        print("\nRunning create_or_update_index...")
        start_time = time.time()
        # Use the function from indexer.py
        create_or_update_index(acordao_chunks, embedding_model_instance_for_retriever_test)
        index_time = time.time() - start_time
        print(f"Indexing function completed in {index_time:.2f} seconds.")

        # 4. Basic Verification (Check if DB exists and has items)
        print("\nVerifying index existence...")
        if not os.path.exists(persist_dir):
            raise FileNotFoundError(f"ChromaDB directory was not created at {persist_dir}")
        print(f"  ChromaDB directory found: {persist_dir}")
        verify_client = None # Initialize to ensure finally block works
        try:
            print("  Connecting verification client...")
            verify_client = chromadb.PersistentClient(path=persist_dir)
            print(f"  Getting collection '{CHROMA_COLLECTION_NAME}' for verification...")
            verify_collection = verify_client.get_collection(name=CHROMA_COLLECTION_NAME)
            count = verify_collection.count()
            if count != len(acordao_chunks):
                raise ValueError(f"Index verification failed: Expected {len(acordao_chunks)} items, found {count}.")
            print(f"  Collection '{CHROMA_COLLECTION_NAME}' found with {count} items.")
            # verify_client.reset() is intentionally NOT called here: it is not
            # needed and was observed to raise an error.
            setup_successful = True
        except Exception as verify_e:
            # Log the specific verification error (with traceback), then
            # re-raise with explicit chaining so the root cause stays attached.
            logging.error(f"Error during index verification step: {verify_e}", exc_info=True)
            raise RuntimeError(f"Error during index verification: {verify_e}") from verify_e
        finally:
            # Attempt to clean up the verification client instance
            # Setting to None might help release resources, though not guaranteed for file locks
            if verify_client is not None:
                print("  Setting verification client instance to None.")
                verify_client = None

        print("\n-> SETUP COMPLETED SUCCESSFULLY")

    except Exception as e:
        print(f"-> SETUP FAILED: {e}")
        # Cleanup if setup failed partially
        print("Attempting cleanup after setup failure...")
        cleanup_chroma_test_db(CHROMA_PERSIST_DIR)

print("\n" + "="*10 + " End of Pre-Test Indexing Setup " + "="*10)

2025-04-30 14:58:38,969 - RETRIEVER_TEST - INFO - Starting data loading and preparation for:
2025-04-30 14:58:38,970 - RETRIEVER_TEST - INFO -   Acordão: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf
2025-04-30 14:58:38,971 - RETRIEVER_TEST - INFO -   Resumo: f:\interview\acordao\acordao_validator\data\Acórdão 733-2025 resumos.txt
2025-04-30 14:58:38,988 - RETRIEVER_TEST - INFO - Reading PDF: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf with 44 pages.



Attempting pre-setup cleanup...
Test ChromaDB directory not found (no cleanup needed): f:\interview\acordao\acordao_validator\notebooks\chroma_db_index

Loading data using data_loader...


2025-04-30 14:58:39,720 - RETRIEVER_TEST - INFO - Processing Acordão as PDF file.
2025-04-30 14:58:39,721 - RETRIEVER_TEST - INFO - Processed 44 chunks from the acórdão.
2025-04-30 14:58:39,722 - RETRIEVER_TEST - INFO - Processed 3 claims from the resumo.
2025-04-30 14:58:39,811 - RETRIEVER_TEST - INFO - Loading embedding model 'intfloat/multilingual-e5-large-instruct' onto device: cuda
2025-04-30 14:58:39,815 - RETRIEVER_TEST - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large-instruct


Loaded 44 acórdão chunks.

Loading embedding model...


2025-04-30 14:58:45,884 - RETRIEVER_TEST - INFO - Embedding model 'intfloat/multilingual-e5-large-instruct' loaded successfully.
2025-04-30 14:58:45,885 - RETRIEVER_TEST - INFO - Starting index creation/update process for 44 chunks...
2025-04-30 14:58:45,885 - RETRIEVER_TEST - INFO - Initializing ChromaDB client at path: .\chroma_db_index
2025-04-30 14:58:45,924 - RETRIEVER_TEST - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-04-30 14:58:46,056 - RETRIEVER_TEST - INFO - Getting or creating collection: acordaos
2025-04-30 14:58:46,087 - RETRIEVER_TEST - INFO - Preparing data for indexing (IDs, prefixed text, metadata)...


Model loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

Running create_or_update_index...


2025-04-30 14:58:46,088 - RETRIEVER_TEST - INFO - Generating embeddings for 44 chunks using model intfloat/multilingual-e5-large-instruct...
Batches: 100%|██████████| 2/2 [00:05<00:00,  2.99s/it]
2025-04-30 14:58:52,092 - RETRIEVER_TEST - INFO - Embedding generation complete.
2025-04-30 14:58:52,093 - RETRIEVER_TEST - INFO - Upserting 44 items into ChromaDB collection 'acordaos'...
2025-04-30 14:58:52,422 - RETRIEVER_TEST - INFO - Upsert completed successfully.
2025-04-30 14:58:52,424 - RETRIEVER_TEST - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Indexing function completed in 6.54 seconds.

Verifying index existence...
  ChromaDB directory found: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
  Connecting verification client...
  Getting collection 'acordaos' for verification...
  Collection 'acordaos' found with 44 items.
  Setting verification client instance to None.

-> SETUP COMPLETED SUCCESSFULLY



In [4]:
# Cell 4: Test retrieve_relevant_chunks Function
# Defines test_retrieval_logic and then runs it against the index built in Cell 3.

# Banner marking where this cell's printed output starts.
print("\n" + "="*10 + " Running Test: test_retrieval_logic " + "="*10)

def test_retrieval_logic(model, collection_name):
    """
    Tests the core logic of retrieve_relevant_chunks.
    Assumes the index was successfully created in Cell 3.
    """
    if retrieve_relevant_chunks is None:
        print("SKIPPING test_retrieval_logic: retrieve_relevant_chunks function not imported.")
        return

    print("\n--- Test Case 1: Valid Query, Default top_k=3 ---")
    # Use a claim from the Acórdão 733 resumos
    query1 = "O BNDES é uma estatal dependente da União"
    print(f"Query: \"{query1}\"")
    results1 = retrieve_relevant_chunks(
        query_claim=query1,
        top_k=3,
        model=model, # Pass the already loaded model
        collection=None # Let the function initialize the collection connection
    )

    # --- Assertions for Valid Query ---
    assert results1 is not None, "Retrieval failed, returned None"
    assert isinstance(results1, dict), f"Expected dict, got {type(results1)}"
    expected_keys = ['ids', 'embeddings', 'documents', 'metadatas', 'distances']
    # Note: embeddings might be None if not included in the query's 'include' list in retriever.py
    # Let's adjust based on retriever's include=['metadatas', 'documents', 'distances']
    expected_keys = ['ids', 'documents', 'metadatas', 'distances']
    for key in expected_keys:
        assert key in results1, f"Expected key '{key}' not in results"
        assert results1[key] is not None, f"Key '{key}' is None"
        assert isinstance(results1[key], list), f"Expected list for key '{key}', got {type(results1[key])}"
        # ChromaDB returns lists of lists, e.g., [[id1, id2]], [[doc1, doc2]]
        assert len(results1[key]) == 1, f"Expected outer list of size 1 for key '{key}'"
        assert isinstance(results1[key][0], list), f"Expected inner list for key '{key}'"

    num_retrieved1 = len(results1['ids'][0])
    print(f"Retrieved {num_retrieved1} results.")
    # Check if number of results is <= top_k (might be fewer if collection has < k items)
    assert num_retrieved1 <= 3, f"Expected at most 3 results, got {num_retrieved1}"
    if num_retrieved1 > 0:
         print(f"  Result 1 Distance: {results1['distances'][0][0]:.4f}")
         print(f"  Result 1 Metadata: {results1['metadatas'][0][0]}")
         print(f"  Result 1 Document Preview: {results1['documents'][0][0][:100]}...")
    print("-> Test Case 1 PASSED")


    print("\n--- Test Case 2: Valid Query, Different top_k=5 ---")
    query2 = "restituísse os valores recebidos a título de PLR" # Another claim
    print(f"Query: \"{query2}\", top_k=5")
    results2 = retrieve_relevant_chunks(query_claim=query2, top_k=5, model=model, collection=None) # Use cached model

    assert results2 is not None, "Retrieval failed"
    assert isinstance(results2, dict), "Expected dict"
    num_retrieved2 = len(results2.get('ids', [[]])[0])
    print(f"Retrieved {num_retrieved2} results.")
    assert num_retrieved2 <= 5, f"Expected at most 5 results, got {num_retrieved2}"
    print("-> Test Case 2 PASSED")


    print("\n--- Test Case 3: Empty Query String ---")
    query3 = ""
    print(f"Query: \"{query3}\"")
    results3 = retrieve_relevant_chunks(query_claim=query3, top_k=3, model=model, collection=None)
    assert results3 is None, f"Expected None for empty query, got {type(results3)}"
    print("-> Test Case 3 PASSED (Returned None as expected)")

    print("\n--- Test Case 4: Query Unlikely to Match ---")
    # Use a query very different from the Acordão content
    query4 = "Constituição Federal de 1988 artigo quinto inciso primeiro sobre alienígenas"
    print(f"Query: \"{query4}\"")
    results4 = retrieve_relevant_chunks(query_claim=query4, top_k=3, model=model, collection=None)
    assert results4 is not None, "Retrieval should not fail for non-matching query, just return results"
    num_retrieved4 = len(results4.get('ids', [[]])[0])
    # It WILL retrieve the closest matches, even if they are poor matches (high distance)
    print(f"Retrieved {num_retrieved4} results (expected {min(3, 44)}).") # 44 is num chunks indexed
    assert num_retrieved4 <= 3
    if num_retrieved4 > 0:
        print(f"  Top result distance (likely high): {results4['distances'][0][0]:.4f}")
    print("-> Test Case 4 PASSED (Returned results, even if poor matches)")


# --- Run the Test ---
# Failures are reported with a printed "Result:" line instead of being raised,
# so the remaining notebook cells still execute.
if not setup_successful:
    print("SKIPPING tests in Cell 4 because setup in Cell 3 failed.")
elif retrieve_relevant_chunks is None:
    # test_retrieval_logic returns early in this situation; without this guard
    # the branch below would announce that all tests passed although none ran.
    print("SKIPPING tests in Cell 4 because retrieve_relevant_chunks was not imported in Cell 2.")
else:
    try:
        # Pass the model instance loaded during setup
        test_retrieval_logic(embedding_model_instance_for_retriever_test, CHROMA_COLLECTION_NAME)
        print("\nResult: ALL test_retrieval_logic tests PASSED")
    except AssertionError as e:
        print(f"\nResult: TEST FAILED - Assertion Error: {e}")
    except Exception as e:
        # Keep the traceback: the message alone (e.g. a bare KeyError key) is
        # rarely enough to locate an unexpected failure.
        logging.error(f"Unexpected error while running test_retrieval_logic: {e}", exc_info=True)
        print(f"\nResult: TEST FAILED - Unexpected Error: {e}")


print("\n" + "="*10 + " End of Test: test_retrieval_logic " + "="*10)

2025-04-30 14:58:52,607 - RETRIEVER_TEST - INFO - Loading embedding model 'intfloat/multilingual-e5-large-instruct' onto device: cuda
2025-04-30 14:58:52,611 - RETRIEVER_TEST - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large-instruct




--- Test Case 1: Valid Query, Default top_k=3 ---
Query: "O BNDES é uma estatal dependente da União"


2025-04-30 14:58:58,451 - RETRIEVER_TEST - INFO - Embedding model 'intfloat/multilingual-e5-large-instruct' loaded successfully.
2025-04-30 14:58:58,452 - RETRIEVER_TEST - INFO - Initializing ChromaDB client at path: .\chroma_db_index
2025-04-30 14:58:58,479 - RETRIEVER_TEST - INFO - Getting collection: acordaos
2025-04-30 14:58:58,490 - RETRIEVER_TEST - INFO - Successfully connected to collection 'acordaos' with 44 items.
2025-04-30 14:58:58,493 - RETRIEVER_TEST - INFO - Generating embedding for query: 'O BNDES é uma estatal dependente da União...'
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.58it/s]
2025-04-30 14:58:58,570 - RETRIEVER_TEST - INFO - Querying collection 'acordaos' for top 3 results...
2025-04-30 14:58:58,605 - RETRIEVER_TEST - INFO - Retrieved 3 results.
2025-04-30 14:58:58,606 - RETRIEVER_TEST - INFO - Getting collection: acordaos
2025-04-30 14:58:58,609 - RETRIEVER_TEST - INFO - Successfully connected to collection 'acordaos' with 44 items.
2025-04-30 14:58:58,610 

Retrieved 3 results.
  Result 1 Distance: 0.1219
  Result 1 Metadata: {'chunk_index': 31, 'chunk_type': 'paragraph', 'source': 'f:\\interview\\acordao\\acordao_validator\\data\\Acórdão 733 de 2025 Plenário.pdf', 'page_number': 32}
  Result 1 Document Preview: TRIBUNAL DE CONTAS DA UNIÃO TC 004.980/2017-4 
 
5 
 
23. Pode-se afirmar, assim, que o fato de obte...
-> Test Case 1 PASSED

--- Test Case 2: Valid Query, Different top_k=5 ---
Query: "restituísse os valores recebidos a título de PLR", top_k=5


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.68it/s]
2025-04-30 14:58:58,649 - RETRIEVER_TEST - INFO - Querying collection 'acordaos' for top 5 results...
2025-04-30 14:58:58,654 - RETRIEVER_TEST - INFO - Retrieved 5 results.
2025-04-30 14:58:58,656 - RETRIEVER_TEST - INFO - Getting collection: acordaos
2025-04-30 14:58:58,660 - RETRIEVER_TEST - INFO - Successfully connected to collection 'acordaos' with 44 items.
2025-04-30 14:58:58,662 - RETRIEVER_TEST - INFO - Generating embedding for query: 'Constituição Federal de 1988 artigo quinto inciso primeiro sobre alienígenas...'


Retrieved 5 results.
-> Test Case 2 PASSED

--- Test Case 3: Empty Query String ---
Query: ""
-> Test Case 3 PASSED (Returned None as expected)

--- Test Case 4: Query Unlikely to Match ---
Query: "Constituição Federal de 1988 artigo quinto inciso primeiro sobre alienígenas"


Batches: 100%|██████████| 1/1 [00:00<00:00, 33.00it/s]
2025-04-30 14:58:58,702 - RETRIEVER_TEST - INFO - Querying collection 'acordaos' for top 3 results...
2025-04-30 14:58:58,704 - RETRIEVER_TEST - INFO - Retrieved 3 results.


Retrieved 3 results (expected 3).
  Top result distance (likely high): 0.1679
-> Test Case 4 PASSED (Returned results, even if poor matches)

Result: ALL test_retrieval_logic tests PASSED

