In [1]:
import sys
import os
import logging
import shutil # For cleaning up the test database directory
import time

# Emit INFO-and-above log records with a "TEST" tag so they stand out in cell output.
logging.basicConfig(
    format='%(asctime)s - TEST - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make the parent of the current working directory importable so that the
# `src.*` imports used by the later cells resolve.
# Assumes the notebook is run from the 'notebooks/' dir.
module_path = os.path.abspath('..')
if module_path in sys.path:
    print(f"{module_path} already in sys.path")
else:
    print(f"Adding {module_path} to sys.path")
    sys.path.append(module_path)

Adding f:\interview\acordao\acordao_validator to sys.path


In [2]:

# Pull in the project functions under test. A failed import is reported and the
# affected names are bound to None so that the test cells can detect it and skip.
try:
    from src.data_loader import load_and_prepare_data
except ImportError as e:
    print(f"ERROR importing from data_loader: {e}")
    load_and_prepare_data = None
else:
    print("Successfully imported from data_loader.")

try:
    from src.indexer import get_embedding_model, create_or_update_index
    # Constants used by the indexer; the tests read them for verification
    from src.indexer import CHROMA_PERSIST_DIR, CHROMA_COLLECTION_NAME, EMBEDDING_MODEL_NAME
except ImportError as e:
    print(f"ERROR importing from indexer: {e}")
    get_embedding_model = create_or_update_index = None
    # Stand-in values so that later references to the constants do not fail
    CHROMA_PERSIST_DIR = os.path.join("..", "chroma_db_test_index")  # a separate test dir
    CHROMA_COLLECTION_NAME = "test_acordaos"
    EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large-instruct"  # keep consistent
else:
    print("Successfully imported from indexer.")


# --- Helper Function for Cleanup ---
def cleanup_chroma_test_db(persist_dir=CHROMA_PERSIST_DIR):
    """Delete the ChromaDB test directory, reporting the outcome on stdout.

    Args:
        persist_dir: Directory to remove. It is resolved to an absolute path
            before use. Defaults to the value CHROMA_PERSIST_DIR had when this
            function was defined.

    Returns:
        None. An error raised while deleting is printed rather than
        propagated, so a failed cleanup never interrupts the caller.
    """
    target_dir = os.path.abspath(persist_dir)

    # Guard clause: nothing on disk means nothing to remove.
    if not os.path.exists(target_dir):
        print(f"Test ChromaDB directory not found (no cleanup needed): {target_dir}")
        return

    print(f"Cleaning up existing test ChromaDB directory: {target_dir}")
    try:
        shutil.rmtree(target_dir)
    except Exception as e:
        print(f"Error during cleanup: {e}. Manual deletion might be required.")
    else:
        print("Cleanup successful.")

# Report the resolved (absolute) directory once so the cell output shows which path is used.
print(f"\nSetup complete. Using test ChromaDB directory: {os.path.abspath(CHROMA_PERSIST_DIR)}")
# Initial cleanup before tests start (best effort: cleanup errors are only printed).
# NOTE(review): when the indexer import succeeds, CHROMA_PERSIST_DIR is the indexer's
# own constant rather than a dedicated test path, so this deletes whatever index is
# stored there — confirm that this is intended.
cleanup_chroma_test_db(CHROMA_PERSIST_DIR)

Successfully imported from data_loader.


  from .autonotebook import tqdm as notebook_tqdm


Successfully imported from indexer.

Setup complete. Using test ChromaDB directory: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
Cleaning up existing test ChromaDB directory: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
Cleanup successful.


In [3]:
# Cell 2: Test Embedding Model Loading
# The indexer names are imported again here; on failure only get_embedding_model
# is reset, which is what the skip check inside test_model_loading looks at.
try:
    from src.indexer import get_embedding_model, create_or_update_index
    # Constants used by the indexer, imported for verification
    from src.indexer import CHROMA_PERSIST_DIR, CHROMA_COLLECTION_NAME, EMBEDDING_MODEL_NAME
except ImportError as e:
    print(f"ERROR importing from indexer: {e}")
    get_embedding_model = None  # makes `if get_embedding_model is None:` skip the test
    # ... other fallback definitions ...
else:
    print("Successfully imported from indexer.")

from sentence_transformers import SentenceTransformer
import torch
import time  # harmless if already imported by an earlier cell

def test_model_loading():
    """Tests the get_embedding_model function."""
    if get_embedding_model is None:
        print("Skipping test_model_loading due to import error.")
        return

    print("\n" + "="*10 + " Running test: test_model_loading " + "="*10)
    start_time = time.time()
    model = None
    try:
        # Use the model name defined in the indexer constants
        print(f"Attempting to load model: {EMBEDDING_MODEL_NAME}")
        model = get_embedding_model() # Uses the default defined in indexer
        load_time = time.time() - start_time
        print(f"Model loaded in {load_time:.2f} seconds.")

        # Assertions
        assert model is not None, "Model loading returned None"
        assert isinstance(model, SentenceTransformer), f"Expected SentenceTransformer, got {type(model)}"
        print(f"Model class: {type(model)}")
        # Check the device (requires torch)
        expected_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Note: model.device might be a torch.device object
        # Check the device type instead of the exact string 'cuda:0'
        assert model.device.type == expected_device, f"Model loaded on unexpected device type: {model.device.type} (Expected: {expected_device})"
        print(f"Model loaded successfully on device: {model.device}")
        print("-> test_model_loading PASSED")
        return model # Return model for use in next test

    except Exception as e:
        load_time = time.time() - start_time
        print(f"-> test_model_loading FAILED after {load_time:.2f} seconds: {e}")
        # Re-raise if you want the notebook execution to stop on failure
        # raise e
        return None # Return None if failed


# --- Run the Test ---
# This might take a while the first time it downloads the model
# The result (the model, or None when the test was skipped or failed) is kept at
# notebook scope so that the indexing test can reuse it instead of loading it again.
embedding_model_instance = test_model_loading()


2025-04-30 14:41:51,626 - TEST - INFO - Loading embedding model 'intfloat/multilingual-e5-large-instruct' onto device: cuda
2025-04-30 14:41:51,629 - TEST - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-large-instruct


Successfully imported from indexer.

Attempting to load model: intfloat/multilingual-e5-large-instruct


2025-04-30 14:42:03,404 - TEST - INFO - Embedding model 'intfloat/multilingual-e5-large-instruct' loaded successfully.


Model loaded in 11.90 seconds.
Model class: <class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
Model loaded successfully on device: cuda:0
-> test_model_loading PASSED


In [4]:
# Cell 3: Test Index Creation/Update

import chromadb # Need this for verification step

def test_indexing(base_dir, model):
    """Tests the create_or_update_index function."""
    if create_or_update_index is None or load_and_prepare_data is None:
        print("Skipping test_indexing due to import errors in Cell 1.")
        return
    if model is None:
        print("Skipping test_indexing because embedding model failed to load in Cell 2.")
        return

    print("\n" + "="*10 + " Running test: test_indexing " + "="*10)
    data_dir = os.path.join(base_dir, "data")
    persist_dir = os.path.abspath(CHROMA_PERSIST_DIR) # Use absolute path

    # --- Test Files ---
    acordao_file = "Acórdão 733 de 2025 Plenário.pdf" # Use a real file
    resumo_file = "Acórdão 733-2025 resumos.txt"
    acordao_path = os.path.join(data_dir, acordao_file)
    resumo_path = os.path.join(data_dir, resumo_file)

    all_passed = True

    # --- Setup: Load data ---
    print(f"\nLoading data using data_loader...")
    acordao_chunks, _ = load_and_prepare_data(acordao_path, resumo_path)
    if acordao_chunks is None:
        print("-> Setup FAILED: data_loader failed to load data.")
        return # Cannot proceed

    num_chunks_loaded = len(acordao_chunks)
    print(f"Loaded {num_chunks_loaded} acórdão chunks.")
    if num_chunks_loaded == 0:
        print("Warning: No chunks loaded, indexing test will be trivial.")

    # --- Execution: Run the indexer ---
    print(f"\nRunning create_or_update_index...")
    start_time = time.time()
    try:
        create_or_update_index(acordao_chunks, model)
        index_time = time.time() - start_time
        print(f"Indexing function completed in {index_time:.2f} seconds.")
    except Exception as e:
        index_time = time.time() - start_time
        print(f"-> Indexing FAILED after {index_time:.2f} seconds: {e}")
        # Optional: Re-raise if needed
        # raise e
        all_passed = False
        # Attempt verification anyway, maybe the directory was created
        pass

    # --- Verification ---
    print(f"\nVerifying results...")
    # 1. Check if persist directory exists
    if not os.path.exists(persist_dir):
        print(f"-> Verification FAILED: ChromaDB directory was not created at {persist_dir}")
        all_passed = False
    else:
        print(f"  ChromaDB directory found: {persist_dir}")

        # 2. Try to connect and check collection count
        try:
            print(f"  Attempting to connect to persistent client at {persist_dir}...")
            verify_client = chromadb.PersistentClient(path=persist_dir)
            print(f"  Attempting to get collection: {CHROMA_COLLECTION_NAME}")
            verify_collection = verify_client.get_collection(name=CHROMA_COLLECTION_NAME)
            collection_count = verify_collection.count()
            print(f"  Collection count: {collection_count}")

            # Assert count matches number of chunks loaded
            if collection_count != num_chunks_loaded:
                 print(f"-> Verification FAILED: Collection count ({collection_count}) does not match loaded chunks ({num_chunks_loaded})")
                 all_passed = False
            else:
                 print("  Collection count matches loaded chunks.")

                 # 3. (Optional) Peek at one item
                 if collection_count > 0:
                    print("  Peeking at one item...")
                    peek_result = verify_collection.peek(limit=1)
                    # Check if necessary keys are present in the peek result
                    if not peek_result or not peek_result.get('ids') or not peek_result.get('documents') or not peek_result.get('metadatas'):
                         print(f"-> Verification FAILED: Peek result structure is incorrect: {peek_result}")
                         all_passed = False
                    else:
                         print(f"  Peek successful. ID: {peek_result['ids'][0]}, Metadata: {peek_result['metadatas'][0]}")

        except Exception as e:
            print(f"-> Verification FAILED: Error during ChromaDB verification: {e}")
            all_passed = False

    print("\n" + "="*10 + " End of test: test_indexing " + "="*10)
    if all_passed:
        print("Result: TEST PASSED")
    else:
        print("Result: TEST FAILED")

# --- Run the Test ---
# The parent of the working directory is passed as the test's base directory.
base_project_dir = os.path.abspath(os.pardir)
# Remove any leftover store first so the test starts from a clean directory
cleanup_chroma_test_db(persist_dir=CHROMA_PERSIST_DIR)
# Reuse the model loaded in Cell 2
test_indexing(base_dir=base_project_dir, model=embedding_model_instance)
# Optional: Run cleanup again after the test
# cleanup_chroma_test_db(CHROMA_PERSIST_DIR)

2025-04-30 14:42:03,464 - TEST - INFO - Starting data loading and preparation for:
2025-04-30 14:42:03,466 - TEST - INFO -   Acordão: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf
2025-04-30 14:42:03,467 - TEST - INFO -   Resumo: f:\interview\acordao\acordao_validator\data\Acórdão 733-2025 resumos.txt
2025-04-30 14:42:03,497 - TEST - INFO - Reading PDF: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf with 44 pages.


Test ChromaDB directory not found (no cleanup needed): f:\interview\acordao\acordao_validator\notebooks\chroma_db_index


Loading data using data_loader...


2025-04-30 14:42:04,119 - TEST - INFO - Processing Acordão as PDF file.
2025-04-30 14:42:04,121 - TEST - INFO - Processed 44 chunks from the acórdão.
2025-04-30 14:42:04,121 - TEST - INFO - Processed 3 claims from the resumo.
2025-04-30 14:42:04,122 - TEST - INFO - Starting index creation/update process for 44 chunks...
2025-04-30 14:42:04,122 - TEST - INFO - Initializing ChromaDB client at path: .\chroma_db_index
2025-04-30 14:42:04,242 - TEST - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Loaded 44 acórdão chunks.

Running create_or_update_index...


2025-04-30 14:42:04,541 - TEST - INFO - Getting or creating collection: acordaos
2025-04-30 14:42:04,604 - TEST - INFO - Preparing data for indexing (IDs, prefixed text, metadata)...
2025-04-30 14:42:04,605 - TEST - INFO - Generating embeddings for 44 chunks using model intfloat/multilingual-e5-large-instruct...
Batches: 100%|██████████| 2/2 [00:06<00:00,  3.47s/it]
2025-04-30 14:42:11,586 - TEST - INFO - Embedding generation complete.
2025-04-30 14:42:11,587 - TEST - INFO - Upserting 44 items into ChromaDB collection 'acordaos'...
2025-04-30 14:42:11,921 - TEST - INFO - Upsert completed successfully.
2025-04-30 14:42:11,924 - TEST - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Indexing function completed in 7.80 seconds.

Verifying results...
  ChromaDB directory found: f:\interview\acordao\acordao_validator\notebooks\chroma_db_index
  Attempting to connect to persistent client at f:\interview\acordao\acordao_validator\notebooks\chroma_db_index...
  Attempting to get collection: acordaos
  Collection count: 44
  Collection count matches loaded chunks.
  Peeking at one item...
  Peek successful. ID: Acórdão 733 de 2025 Plenário.pdf_0, Metadata: {'chunk_index': 0, 'chunk_type': 'paragraph', 'page_number': 1, 'source': 'f:\\interview\\acordao\\acordao_validator\\data\\Acórdão 733 de 2025 Plenário.pdf'}

Result: TEST PASSED
