In [2]:
"""
Test suite for normalize_string function.

Tests 20 edge cases covering:
- Basic normalization (case, whitespace, punctuation)
- Unicode handling (diacritics and accented Latin characters)
- Hyphen, dash, apostrophe, and quote variations
- Edge cases (empty and whitespace-only input, digits inside words)
"""

import sys
from pathlib import Path

# Add the directory two levels above the current working directory to the import path
sys.path.insert(0, str(Path().resolve().parent.parent))

from implementation.misc.helpers import normalize_string


# Define 20 test cases: (input_string, expected_output, description)
#
# Non-ASCII punctuation (curly apostrophes, en/em dashes) is written with
# explicit \u escapes so the exact code point under test is visible and
# cannot be silently replaced by an editor or copy/paste.
TEST_CASES = [
    # Basic punctuation and case handling - HYPHENS NOW PRESERVED
    ("Spider-Man: Into the Spider-Verse", "spider-man into the spider-verse", "Hyphen preserved, colon becomes space"),
    ("Ocean's Eleven", "oceans eleven", "Standard apostrophe removal"),
    ("L.A. Confidential", "la confidential", "Periods removed (no space)"),
    ("Se7en", "se7en", "Numbers preserved within words"),

    # Unicode diacritics and accents
    ("Amélie", "amelie", "French accent (acute)"),
    ("Señor López", "senor lopez", "Spanish tilde and accent"),
    ("Björk's Greatest Hits", "bjorks greatest hits", "Nordic umlaut with apostrophe"),
    ("Crème brûlée", "creme brulee", "Multiple French diacritics"),
    ("Mötley Crüe", "motley crue", "Heavy metal umlauts"),

    # Various apostrophe and quote styles
    # U+2019 RIGHT SINGLE QUOTATION MARK is the typographic ("curly") apostrophe.
    ("It\u2019s a Wonderful Life", "its a wonderful life", "Curly apostrophe (right single quote)"),
    ("Rock \u2019n\u2019 Roll", "rock n roll", "Multiple curly apostrophes"),
    ("\"Quoted Title\"", "quoted title", "Double quotes removed"),

    # Various hyphen and dash styles - regular hyphens preserved, others become space
    # U+2013 is the en-dash, U+2014 is the em-dash.
    ("Spider\u2013Man", "spider man", "En-dash becomes space"),
    ("Spider\u2014Man", "spider man", "Em-dash becomes space"),
    ("Jean-Luc Picard", "jean-luc picard", "Regular hyphen preserved"),

    # Whitespace edge cases
    ("  Multiple   Spaces  ", "multiple spaces", "Multiple spaces collapsed and trimmed"),
    ("Tab\tSeparated\tWords", "tab separated words", "Tabs become single spaces"),
    ("Line\nBreak\rTest", "line break test", "Newlines and carriage returns"),

    # Empty and minimal inputs
    ("", "", "Empty string returns empty"),
    ("   ", "", "Whitespace-only returns empty"),
]


def run_tests(normalize=None, cases=None):
    """Run normalization test cases and print a per-case report.

    Args:
        normalize: Callable that takes the input string and returns the
            normalized string. Defaults to the module-level
            ``normalize_string``.
        cases: Iterable of ``(input_string, expected_output, description)``
            tuples. Defaults to the module-level ``TEST_CASES``.

    Returns:
        True if every case produced exactly its expected output,
        False otherwise.
    """
    # Defaults are resolved at call time so a plain ``run_tests()`` behaves
    # exactly as before, while callers may inject another normalizer or table.
    if normalize is None:
        normalize = normalize_string
    if cases is None:
        cases = TEST_CASES
    # Materialize once: the summary needs a length and the input may be a
    # one-shot iterable.
    cases = list(cases)

    banner = "=" * 70
    print(banner)
    print("NORMALIZE_STRING TEST SUITE")
    print(banner)
    print()

    passed = 0

    for i, (input_str, expected, description) in enumerate(cases, 1):
        result = normalize(input_str)
        success = result == expected

        if success:
            passed += 1
        status = "✓ PASS" if success else "✗ FAIL"

        # Display test result; the actual value is shown only on a mismatch.
        print(f"Test {i:2d}: {status}")
        print(f"         Description: {description}")
        print(f"         Input:    {input_str!r}")
        print(f"         Expected: {expected!r}")
        if not success:
            print(f"         Got:      {result!r}")
        print()

    failed = len(cases) - passed

    # Summary
    print(banner)
    print(f"RESULTS: {passed} passed, {failed} failed out of {len(cases)} tests")
    print(banner)

    return failed == 0


# Run the test suite as soon as this cell/module executes.
# all_passed keeps run_tests()'s boolean result: True only when no case failed.
all_passed = run_tests()

NORMALIZE_STRING TEST SUITE

Test  1: ✓ PASS
         Description: Hyphen preserved, colon becomes space
         Input:    'Spider-Man: Into the Spider-Verse'
         Expected: 'spider-man into the spider-verse'

Test  2: ✓ PASS
         Description: Standard apostrophe removal
         Input:    "Ocean's Eleven"
         Expected: 'oceans eleven'

Test  3: ✓ PASS
         Description: Periods removed (no space)
         Input:    'L.A. Confidential'
         Expected: 'la confidential'

Test  4: ✓ PASS
         Description: Numbers preserved within words
         Input:    'Se7en'
         Expected: 'se7en'

Test  5: ✓ PASS
         Description: French accent (acute)
         Input:    'Amélie'
         Expected: 'amelie'

Test  6: ✓ PASS
         Description: Spanish tilde and accent
         Input:    'Señor López'
         Expected: 'senor lopez'

Test  7: ✓ PASS
         Description: Nordic umlaut with apostrophe
         Input:    "Björk's Greatest Hits"
         Expected: 'bjo