In [3]:
import os
import sys
# Make the parent directory (acordao_validator/) importable so `src.*` resolves.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

### Notebook of unit tests

In [4]:
# Cell 1: Setup and Test load_document with Real Data
import sys
import os
import logging

# Configure logging for tests
logging.basicConfig(level=logging.INFO, format='%(asctime)s - TEST - %(levelname)s - %(message)s')

# Import the function AFTER setting up the path. The import is guarded so that a
# failure leaves `load_document` defined as None, which is the condition
# test_load_document_real_data() checks before deciding to skip.
try:
    from src.data_loader import load_document
except ImportError as import_error:
    # Surface the cause instead of hiding it; the test function will then skip.
    print(f"ERROR: Could not import load_document ({import_error}). Check the path setup cell.")
    load_document = None
else:
    print("Successfully imported load_document.")


# --- Test Function Definition ---
def test_load_document_real_data(base_dir):
    """Tests the load_document function using real data files from the /data folder."""
    if load_document is None:
        print("Skipping test_load_document_real_data due to import error.")
        return

    print("\n" + "="*10 + " Running tests for load_document with REAL DATA " + "="*10)
    # Point directly to the main data directory
    data_dir = os.path.join(base_dir, "data")
    print(f"Using data directory: {data_dir}")

    # --- List of Real Files to Test ---
    # Using the filenames provided earlier
    real_files_to_test = [
        "Acórdão 733 de 2025 Plenário.pdf",
        "Acórdão 764 de 2025 Plenário.pdf",
        "Acórdão 733-2025 resumos.txt",
        "Acórdão 764-2025 resumos.txt",
    ]

    all_passed = True
    for filename in real_files_to_test:
        file_path = os.path.join(data_dir, filename)
        file_type = filename.split('.')[-1].upper() # Get file type (PDF/TXT)

        print(f"\n[Test Case] Testing {file_type}: {filename}")
        if not os.path.exists(file_path):
            print(f"-> {file_type} Test SKIPPED: File not found at {file_path}")
            all_passed = False
            continue

        # Call the function under test
        content = load_document(file_path)

        # --- Assertions ---
        if content is None:
            print(f"-> {file_type} Test FAILED: Returned None")
            all_passed = False
        elif not isinstance(content, str):
            print(f"-> {file_type} Test FAILED: Did not return a string (Type: {type(content)})")
            all_passed = False
        elif len(content.strip()) == 0:
            # Check if the file is genuinely empty or extraction failed
            if os.path.getsize(file_path) > 0:
                 print(f"-> {file_type} Test FAILED: Returned empty or whitespace-only string for a non-empty file.")
                 all_passed = False
                 if file_type == 'PDF':
                     print("   (PDF extraction might have failed. Check PyPDF2 compatibility with this PDF.)")
            else:
                 print(f"-> {file_type} Test PASSED (File is empty, returned empty string as expected)")

        else:
            print(f"-> {file_type} Test PASSED (Returned {len(content)} characters)")
            # print(f"Content preview (first 100 chars): {content[:100].replace(chr(10), ' ')}") # Uncomment to inspect

    # --- Test Non-existent File ---
    non_existent_path = os.path.join(data_dir, "does_not_exist_at_all.pdf")
    print(f"\n[Test Case] Testing non-existent file: {non_existent_path}")
    content_ne = load_document(non_existent_path)
    if content_ne is not None:
         print(f"-> Non-existent Test FAILED: Should return None, but returned: {type(content_ne)}")
         all_passed = False
    else:
        print("-> Non-existent file Test PASSED")

    # --- Test Unsupported Extension ---
    # Create a temporary unsupported file for this test
    unsupported_filename = "dummy_unsupported.xyz"
    unsupported_path = os.path.join(data_dir, unsupported_filename)
    try:
        with open(unsupported_path, "w") as f: f.write("test")
        print(f"\n[Test Case] Testing unsupported extension: {unsupported_path}")
        content_un = load_document(unsupported_path)
        if content_un is not None:
            print(f"-> Unsupported extension Test FAILED: Should return None, but returned: {type(content_un)}")
            all_passed = False
        else:
            print("-> Unsupported extension Test PASSED")
    finally:
        if os.path.exists(unsupported_path):
            os.remove(unsupported_path) # Clean up temporary file
            print(f"   (Cleaned up {unsupported_filename})")


    print("\n" + "="*10 + " All load_document tests completed " + "="*10)
    if all_passed:
        print("Result: ALL TESTS PASSED OR WERE SKIPPED APPROPRIATELY")
    else:
        print("Result: SOME TESTS FAILED")

# --- Run the load_document checks ---
# The project root is taken to be the parent of the current working directory.
base_project_dir = os.path.abspath('..')
test_load_document_real_data(base_project_dir)


2025-04-30 10:56:17,311 - TEST - INFO - Reading PDF: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf with 44 pages.


Successfully imported load_document.

Using data directory: f:\interview\acordao\acordao_validator\data

[Test Case] Testing PDF: Acórdão 733 de 2025 Plenário.pdf


2025-04-30 10:56:18,184 - TEST - INFO - Reading PDF: f:\interview\acordao\acordao_validator\data\Acórdão 764 de 2025 Plenário.pdf with 9 pages.
2025-04-30 10:56:18,352 - TEST - ERROR - Error: File not found at 'f:\interview\acordao\acordao_validator\data\does_not_exist_at_all.pdf'


-> PDF Test PASSED (Returned 166256 characters)

[Test Case] Testing PDF: Acórdão 764 de 2025 Plenário.pdf
-> PDF Test PASSED (Returned 31615 characters)

[Test Case] Testing TXT: Acórdão 733-2025 resumos.txt
-> TXT Test PASSED (Returned 1516 characters)

[Test Case] Testing TXT: Acórdão 764-2025 resumos.txt
-> TXT Test PASSED (Returned 1468 characters)

[Test Case] Testing non-existent file: f:\interview\acordao\acordao_validator\data\does_not_exist_at_all.pdf
-> Non-existent file Test PASSED

[Test Case] Testing unsupported extension: f:\interview\acordao\acordao_validator\data\dummy_unsupported.xyz
-> Unsupported extension Test PASSED
   (Cleaned up dummy_unsupported.xyz)

Result: ALL TESTS PASSED OR WERE SKIPPED APPROPRIATELY


In [5]:
# Cell 2: Test split_text_into_paragraphs

# Bring in the function under test; this relies on the earlier path-setup cell
# having put the project root on sys.path.
try:
    from src.data_loader import split_text_into_paragraphs
except ImportError:
    # Keep the name defined so test_split_paragraphs() can detect the failure and skip.
    split_text_into_paragraphs = None
    print("ERROR: Could not import split_text_into_paragraphs. Check Cell 1 setup and ensure no errors occurred.")
else:
    print("Successfully imported split_text_into_paragraphs.")

# --- Test Function Definition ---
def test_split_paragraphs():
    """Tests the split_text_into_paragraphs function."""
    if split_text_into_paragraphs is None:
        print("Skipping test_split_paragraphs due to import error.")
        return

    print("\n" + "="*10 + " Running tests for split_text_into_paragraphs " + "="*10)
    all_passed = True

    # --- Test Case 1: Basic paragraphs ---
    print("\n[Test Case 1] Basic paragraphs")
    text1 = "Paragraph one.\n\nParagraph two.\nHas two lines.\n\nParagraph three."
    expected1 = ["Paragraph one.", "Paragraph two.\nHas two lines.", "Paragraph three."]
    result1 = split_text_into_paragraphs(text1)
    if result1 != expected1:
        print(f"-> Basic Test FAILED:\nExpected: {expected1}\nReceived: {result1}")
        all_passed = False
    else:
        print("-> Basic Test PASSED")

    # --- Test Case 2: Extra newlines and whitespace ---
    # Includes leading/trailing spaces, multiple empty lines between paragraphs
    print("\n[Test Case 2] Extra newlines and whitespace")
    text2 = "\n\n  Leading/Trailing spaces paragraph.  \n\n\nParagraph after empty lines.\n\n   Another paragraph with spaces.   \n\nTrailing newline paragraph.\n\n"
    expected2 = ["Leading/Trailing spaces paragraph.", "Paragraph after empty lines.", "Another paragraph with spaces.", "Trailing newline paragraph."]
    result2 = split_text_into_paragraphs(text2)
    if result2 != expected2:
        print(f"-> Whitespace Test FAILED:\nExpected: {expected2}\nReceived: {result2}")
        all_passed = False
    else:
        print("-> Whitespace Test PASSED")

    # --- Test Case 3: Single paragraph (no double newlines) ---
    print("\n[Test Case 3] Single paragraph")
    text3 = "Just one block of text without double newlines.\nIt can contain single newlines."
    expected3 = ["Just one block of text without double newlines.\nIt can contain single newlines."]
    result3 = split_text_into_paragraphs(text3)
    if result3 != expected3:
        print(f"-> Single Para Test FAILED:\nExpected: {expected3}\nReceived: {result3}")
        all_passed = False
    else:
        print("-> Single Para Test PASSED")

    # --- Test Case 4: Empty input string ---
    print("\n[Test Case 4] Empty input string")
    text4 = ""
    expected4 = []
    result4 = split_text_into_paragraphs(text4)
    if result4 != expected4:
        print(f"-> Empty Input Test FAILED:\nExpected: {expected4}\nReceived: {result4}")
        all_passed = False
    else:
        print("-> Empty Input Test PASSED")

    # --- Test Case 5: None input ---
    print("\n[Test Case 5] None input")
    text5 = None
    expected5 = []
    result5 = split_text_into_paragraphs(text5)
    if result5 != expected5:
        print(f"-> None Input Test FAILED:\nExpected: {expected5}\nReceived: {result5}")
        all_passed = False
    else:
        print("-> None Input Test PASSED")

    # --- Test Case 6: Text with only whitespace/newlines ---
    # This should result in an empty list as all "paragraphs" become empty after stripping
    print("\n[Test Case 6] Whitespace/Newlines only")
    text6 = "\n   \n\n \t \n\n\n"
    expected6 = []
    result6 = split_text_into_paragraphs(text6)
    if result6 != expected6:
        print(f"-> Whitespace Only Test FAILED:\nExpected: {expected6}\nReceived: {result6}")
        all_passed = False
    else:
        print("-> Whitespace Only Test PASSED")

    print("\n" + "="*10 + " All split_text_into_paragraphs tests completed " + "="*10)
    if all_passed:
        print("Result: ALL PASSED")
    else:
        print("Result: SOME TESTS FAILED")

# --- Call the Test Function ---
# Runs the split_text_into_paragraphs cases defined above; verdicts are printed to stdout.
test_split_paragraphs()

Successfully imported split_text_into_paragraphs.


[Test Case 1] Basic paragraphs
-> Basic Test PASSED

[Test Case 2] Extra newlines and whitespace
-> Whitespace Test PASSED

[Test Case 3] Single paragraph
-> Single Para Test PASSED

[Test Case 4] Empty input string
-> Empty Input Test PASSED

[Test Case 5] None input
-> None Input Test PASSED

[Test Case 6] Whitespace/Newlines only
-> Whitespace Only Test PASSED

Result: ALL PASSED


### 3. Integration test

In [6]:
# Cell 3: Test load_and_prepare_data

import os


# Guarded import: on failure the name is bound to None, which is the condition
# test_load_prepare() checks before deciding to skip.
try:
    from src.data_loader import load_and_prepare_data
except ImportError as import_error:
    # Surface the cause instead of hiding it; the test function will then skip.
    print(f"ERROR: Could not import load_and_prepare_data ({import_error}). Check the path setup cell.")
    load_and_prepare_data = None
else:
    print("Successfully imported load_and_prepare_data.")


# --- Test Function Definition ---
def test_load_prepare(base_dir):
    """Tests the load_and_prepare_data function with real data."""
    if load_and_prepare_data is None:
        print("Skipping test_load_prepare due to import error.")
        return

    print("\n" + "="*10 + " Running tests for load_and_prepare_data " + "="*10)
    data_dir = os.path.join(base_dir, "data")

    # --- Files to use for testing ---
    # Using one pair of real files
    acordao_file = "Acórdão 733 de 2025 Plenário.pdf"
    resumo_file = "Acórdão 733-2025 resumos.txt"
    acordao_path = os.path.join(data_dir, acordao_file)
    resumo_path = os.path.join(data_dir, resumo_file)

    all_passed = True

    # --- Test Case 1: Valid files ---
    print(f"\n[Test Case 1] Testing with valid files:\n  Acordão: {acordao_file}\n  Resumo: {resumo_file}")
    if not os.path.exists(acordao_path) or not os.path.exists(resumo_path):
         print(f"-> Test SKIPPED: One or both files not found.")
         print(f"  Acordao exists: {os.path.exists(acordao_path)}")
         print(f"  Resumo exists: {os.path.exists(resumo_path)}")
         all_passed = False
    else:
        acordao_chunks, resumo_claims = load_and_prepare_data(acordao_path, resumo_path)

        # Check if return types are correct
        if not isinstance(acordao_chunks, list) or not isinstance(resumo_claims, list):
            print(f"-> Valid Files Test FAILED: Did not return two lists. Returned: {type(acordao_chunks)}, {type(resumo_claims)}")
            all_passed = False
        else:
            print(f"  Returned {len(acordao_chunks)} acórdão chunks and {len(resumo_claims)} resumo claims.")

            # Check structure of the first chunk (if exists)
            if acordao_chunks:
                first_chunk = acordao_chunks[0]
                if not isinstance(first_chunk, dict) or "text" not in first_chunk or "metadata" not in first_chunk:
                    print(f"-> Valid Files Test FAILED: Acordão chunk structure incorrect. First chunk: {first_chunk}")
                    all_passed = False
                elif not isinstance(first_chunk["metadata"], dict) or "source" not in first_chunk["metadata"] or "chunk_index" not in first_chunk["metadata"]:
                    print(f"-> Valid Files Test FAILED: Acordão chunk metadata structure incorrect. Metadata: {first_chunk.get('metadata')}")
                    all_passed = False
                else:
                     print("  Acordão chunk structure OK.")
            else:
                 print("  Warning: No acórdão chunks were generated (check PDF extraction or file content).")

            # Check structure of the first claim (if exists)
            if resumo_claims:
                first_claim = resumo_claims[0]
                if not isinstance(first_claim, dict) or "text" not in first_claim or "metadata" not in first_claim:
                    print(f"-> Valid Files Test FAILED: Resumo claim structure incorrect. First claim: {first_claim}")
                    all_passed = False
                elif not isinstance(first_claim["metadata"], dict) or "source" not in first_claim["metadata"] or "claim_index" not in first_claim["metadata"]:
                     print(f"-> Valid Files Test FAILED: Resumo claim metadata structure incorrect. Metadata: {first_claim.get('metadata')}")
                     all_passed = False
                else:
                    print("  Resumo claim structure OK.")
            else:
                 print("  Warning: No resumo claims were generated (check file content).")

            if all_passed: # Only print pass if structure checks passed
                print("-> Valid Files Test PASSED (Structure check)")


    # --- Test Case 2: Non-existent Acordão file ---
    print(f"\n[Test Case 2] Testing with non-existent acórdão file")
    non_existent_acordao_path = os.path.join(data_dir, "non_existent_acordao.pdf")
    result_ac, result_re = load_and_prepare_data(non_existent_acordao_path, resumo_path)
    if result_ac is not None or result_re is not None:
        print(f"-> Non-existent Acordão Test FAILED: Should return (None, None), got ({type(result_ac)}, {type(result_re)})")
        all_passed = False
    else:
        print("-> Non-existent Acordão Test PASSED")


    # --- Test Case 3: Non-existent Resumo file ---
    print(f"\n[Test Case 3] Testing with non-existent resumo file")
    non_existent_resumo_path = os.path.join(data_dir, "non_existent_resumo.txt")
    result_ac, result_re = load_and_prepare_data(acordao_path, non_existent_resumo_path)
    if result_ac is not None or result_re is not None:
        print(f"-> Non-existent Resumo Test FAILED: Should return (None, None), got ({type(result_ac)}, {type(result_re)})")
        all_passed = False
    else:
        print("-> Non-existent Resumo Test PASSED")


    print("\n" + "="*10 + " All load_and_prepare_data tests completed " + "="*10)
    if all_passed:
        print("Result: ALL PASSED (or skipped appropriately)")
    else:
        print("Result: SOME TESTS FAILED")


# --- Run the load_and_prepare_data checks ---
# The project root is taken to be the parent of the current working directory.
base_project_dir = os.path.abspath('..')
test_load_prepare(base_project_dir)


2025-04-30 10:56:38,135 - TEST - INFO - Starting data loading and preparation for:
2025-04-30 10:56:38,135 - TEST - INFO -   Acordão: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf
2025-04-30 10:56:38,136 - TEST - INFO -   Resumo: f:\interview\acordao\acordao_validator\data\Acórdão 733-2025 resumos.txt
2025-04-30 10:56:38,157 - TEST - INFO - Reading PDF: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf with 44 pages.


Successfully imported load_and_prepare_data.


[Test Case 1] Testing with valid files:
  Acordão: Acórdão 733 de 2025 Plenário.pdf
  Resumo: Acórdão 733-2025 resumos.txt


2025-04-30 10:56:39,083 - TEST - INFO - Processed 1 chunks from the acórdão.
2025-04-30 10:56:39,084 - TEST - INFO - Processed 3 claims from the resumo.
2025-04-30 10:56:39,084 - TEST - INFO - Starting data loading and preparation for:
2025-04-30 10:56:39,084 - TEST - INFO -   Acordão: f:\interview\acordao\acordao_validator\data\non_existent_acordao.pdf
2025-04-30 10:56:39,085 - TEST - INFO -   Resumo: f:\interview\acordao\acordao_validator\data\Acórdão 733-2025 resumos.txt
2025-04-30 10:56:39,086 - TEST - ERROR - Error: File not found at 'f:\interview\acordao\acordao_validator\data\non_existent_acordao.pdf'
2025-04-30 10:56:39,087 - TEST - ERROR - Failed to load the main document (acórdão) from: f:\interview\acordao\acordao_validator\data\non_existent_acordao.pdf
2025-04-30 10:56:39,087 - TEST - INFO - Starting data loading and preparation for:
2025-04-30 10:56:39,088 - TEST - INFO -   Acordão: f:\interview\acordao\acordao_validator\data\Acórdão 733 de 2025 Plenário.pdf
2025-04-30 10:

  Returned 1 acórdão chunks and 3 resumo claims.
  Acordão chunk structure OK.
  Resumo claim structure OK.
-> Valid Files Test PASSED (Structure check)

[Test Case 2] Testing with non-existent acórdão file
-> Non-existent Acordão Test PASSED

[Test Case 3] Testing with non-existent resumo file


2025-04-30 10:56:39,951 - TEST - ERROR - Error: File not found at 'f:\interview\acordao\acordao_validator\data\non_existent_resumo.txt'
2025-04-30 10:56:39,952 - TEST - ERROR - Failed to load the summary document (resumo) from: f:\interview\acordao\acordao_validator\data\non_existent_resumo.txt


-> Non-existent Resumo Test PASSED

Result: ALL PASSED (or skipped appropriately)
