In [None]:
# Complete working example with fixed functions
import os
import re
import xml.etree.ElementTree as ET
import pandas as pd
from pathlib import Path
from typing import Set, List

def detect_file_type(file_path: str) -> str:
    """Detect if file is Excel or XLIFF based on extension"""
    extension = Path(file_path).suffix.lower()
    if extension in ['.xlsx', '.xls']:
        return 'excel'
    elif extension in ['.xliff', '.xlf', '.xml']:
        return 'xliff'
    else:
        raise ValueError(f"Unsupported file type: {extension}")

def tokenize_text(text: str) -> Set[str]:
    """
    Tokenize text by whitespace and punctuation, keeping hyphens and apostrophes
    Returns unique tokens without case normalization
    """
    if not text or pd.isna(text):
        return set()
    
    # Remove HTML tags and decode entities first
    cleaned_text = remove_html_tags(text)
    
    # Remove hyperlinks and email addresses
    cleaned_text = remove_hyperlinks_and_emails(cleaned_text)
    
    # Replace punctuation except hyphens (-) and apostrophes (') with spaces
    # Include º in the punctuation list to be replaced with spaces
    cleaned_text = re.sub(r'[^\w\s\'-]|º', ' ', cleaned_text)
    
    # Split by whitespace and filter out empty strings
    tokens = [token for token in cleaned_text.split() if token]
    
    # Clean token edges (remove leading/trailing apostrophes and hyphens)
    cleaned_tokens = []
    for token in tokens:
        cleaned_token = clean_token_edges(token)
        if cleaned_token:  # Only add non-empty tokens
            cleaned_tokens.append(cleaned_token)
    
    # Apply additional filtering
    token_set = set(cleaned_tokens)
    filtered_tokens = filter_tokens(token_set)
    
    return filtered_tokens

def has_square_brackets(text: str) -> bool:
    """Check if text contains content inside square brackets [...]"""
    if not text or pd.isna(text):
        return False
    return bool(re.search(r'\[.+\]', str(text)))

def remove_hyperlinks_and_emails(text: str) -> str:
    """Remove hyperlinks and email addresses from text"""
    if not text or pd.isna(text):
        return ""
    
    text = str(text)
    
    # Remove URLs (http, https, ftp, www)
    text = re.sub(r'https?://[^\s<>"{}|\\^`\[\]]+', '', text)
    text = re.sub(r'ftp://[^\s<>"{}|\\^`\[\]]+', '', text)
    text = re.sub(r'www\.[^\s<>"{}|\\^`\[\]]+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
    
    return text

def remove_html_tags(text: str) -> str:
    """Remove HTML tags and decode HTML entities from text"""
    if not text or pd.isna(text):
        return ""
    
    import html
    text = str(text)
    
    # Decode HTML entities first (e.g., &lt; -> <, &quot; -> ")
    text = html.unescape(text)
    
    # Replace <br> and <p> tags with whitespace to prevent word concatenation
    # Handle both opening and closing tags, and self-closing tags
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'</?p\b[^>]*>', ' ', text, flags=re.IGNORECASE)
    
    # Remove all other HTML tags using regex
    text = re.sub(r'<[^>]+>', '', text)
    
    return text

def is_all_caps_string(text: str) -> bool:
    """Check if the entire string is in all caps (excluding spaces and punctuation)"""
    if not text or pd.isna(text):
        return False
    
    # Extract only alphabetic characters
    letters_only = ''.join(c for c in str(text) if c.isalpha())
    
    # If no letters, return False; otherwise check if all are uppercase
    return len(letters_only) > 0 and letters_only.isupper()

def is_just_numbers(token: str) -> bool:
    """Check if token consists only of digits"""
    if not token:
        return False
    return token.isdigit()

def matches_time_pattern(token: str) -> bool:
    """Check if token matches pattern: digit(s) followed by PA, PM, or AL"""
    if not token:
        return False
    return bool(re.match(r'^\d+(PA|PM|AM|AL)$', token, re.IGNORECASE))

def matches_digit_word_pattern(token: str) -> bool:
    """Check if token matches pattern: digit(s)-word (e.g., 123-neutral)"""
    if not token:
        return False
    return bool(re.match(r'^\d+-\w+$', token))

def clean_token_edges(token: str) -> str:
    """Remove apostrophes and hyphens from the start and end of tokens"""
    if not token:
        return token
    
    # Remove apostrophes and hyphens from start and end repeatedly
    while token and token[0] in "'-":
        token = token[1:]
    while token and token[-1] in "'-":
        token = token[:-1]
    
    return token

def is_same_character_chain(token: str) -> bool:
    """Check if token is a chain of the same character (case-insensitive)"""
    if not token or len(token) < 2:
        return False
    
    # Convert to lowercase and check if all characters are the same
    lower_token = token.lower()
    return len(set(lower_token)) == 1

def filter_tokens(tokens: Set[str]) -> Set[str]:
    """Apply additional filtering to tokens"""
    filtered_tokens = set()
    
    for token in tokens:
        # Remove tokens with length < 3
        if len(token) < 3:
            continue
            
        # Remove tokens that are chains of the same character
        if is_same_character_chain(token):
            continue
        
        # Remove tokens that are just numbers
        if is_just_numbers(token):
            continue
            
        # Remove tokens matching time patterns (digit(s) + PA/PM/AL)
        if matches_time_pattern(token):
            continue
            
        # Remove tokens matching digit-word patterns (e.g., 123-neutral)
        if matches_digit_word_pattern(token):
            continue
            
        filtered_tokens.add(token)
    
    return filtered_tokens

def process_excel_file(file_path: str, language_code: str, ignore_identical_translation: bool = True) -> Set[str]:
    """Process Excel file and extract tokens from the specified language column"""
    try:
        df = pd.read_excel(file_path)
        
        if language_code not in df.columns:
            raise ValueError(f"Language code '{language_code}' not found in Excel columns: {list(df.columns)}")
        
        all_tokens = set()
        
        # Try to find the source language column (typically fr-fr)
        source_column = None
        for col in df.columns:
            if col.lower() == 'fr-fr':
                source_column = col
                break
        
        # Process all values in the language column
        for idx, value in enumerate(df[language_code]):
            if pd.notna(value):
                # Check if corresponding source contains square brackets
                skip_entry = False
                source_value = None
                
                if source_column and idx < len(df[source_column]):
                    source_value = df[source_column].iloc[idx]
                    if has_square_brackets(source_value):
                        skip_entry = True
                        print(f"Skipping entry with square brackets in source: {source_value}")
                
                # Check if target equals source
                if not skip_entry and ignore_identical_translation and source_value is not None:
                    if str(value).strip() == str(source_value).strip():
                        skip_entry = True
                        print(f"Skipping entry where target equals source: {value}")
                
                # Check if target text is all caps
                if not skip_entry and is_all_caps_string(str(value)):
                    skip_entry = True
                    print(f"Skipping entry with all-caps target: {value}")
                
                if not skip_entry:
                    tokens = tokenize_text(str(value))
                    all_tokens.update(tokens)
        
        return all_tokens
        
    except Exception as e:
        raise Exception(f"Error processing Excel file: {str(e)}")

def process_xliff_file(file_path: str, language_code: str, ignore_identical_translation: bool = True) -> Set[str]:
    """Process XLIFF file and extract tokens from source or target elements"""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        all_tokens = set()
        
        # Find all file elements (handle both with and without namespace)
        file_elements = []
        for elem in root.iter():
            if elem.tag.endswith('}file') or elem.tag == 'file':
                file_elements.append(elem)
        
        for file_elem in file_elements:
            source_lang = file_elem.get('source-language', '').lower()
            target_lang = file_elem.get('target-language', '').lower()
            
            # Determine which elements to extract based on language code
            extract_source = source_lang == language_code.lower()
            extract_target = target_lang == language_code.lower()
            
            if not (extract_source or extract_target):
                continue
            
            # Find all trans-unit elements
            trans_units = []
            for elem in file_elem.iter():
                if elem.tag.endswith('}trans-unit') or elem.tag == 'trans-unit':
                    trans_units.append(elem)
            
            for trans_unit in trans_units:
                source_text = None
                target_text = None
                
                # First pass: collect source and target texts
                for elem in trans_unit:
                    if elem.tag.endswith('}source') or elem.tag == 'source':
                        source_text = elem.text
                    elif elem.tag.endswith('}target') or elem.tag == 'target':
                        target_text = elem.text
                
                # Check if source contains square brackets
                skip_entry = source_text and has_square_brackets(source_text)
                if skip_entry:
                    print(f"Skipping entry with square brackets in source: {source_text}")
                    continue
                
                # Check if target equals source
                if not skip_entry and ignore_identical_translation and source_text and target_text:
                    if source_text.strip() == target_text.strip():
                        skip_entry = True
                        print(f"Skipping entry where target equals source: {target_text}")
                        continue
                
                # Check if target text is all caps (only if we're extracting target)
                if not skip_entry and extract_target and target_text:
                    if is_all_caps_string(target_text):
                        skip_entry = True
                        print(f"Skipping entry with all-caps target: {target_text}")
                        continue
                
                # Process the appropriate text based on language
                if extract_source and source_text:
                    tokens = tokenize_text(source_text)
                    all_tokens.update(tokens)
                
                if extract_target and target_text:
                    tokens = tokenize_text(target_text)
                    all_tokens.update(tokens)
        
        return all_tokens
        
    except Exception as e:
        raise Exception(f"Error processing XLIFF file: {str(e)}")

def export_tokens_to_txt(tokens: Set[str], output_path: str):
    """Export tokens to a text file, one token per line"""
    sorted_tokens = sorted(tokens)  # Sort for consistent output
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for token in sorted_tokens:
            f.write(token + '\n')
    
    print(f"Exported {len(tokens)} unique tokens to: {output_path}")

def process_file(file_path: str, language_code: str, output_path: str = None, ignore_identical_translation: bool = True):
    """
    Main function to process a file and extract tokens for a given language code
    
    Args:
        file_path: Path to the Excel or XLIFF file
        language_code: Language code (e.g., "es-es")
        output_path: Optional output path for the txt file
        ignore_identical_translation: If True (default), skip entries where target equals source
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    
    # Detect file type
    file_type = detect_file_type(file_path)
    print(f"Detected file type: {file_type}")
    
    # Process file based on type
    if file_type == 'excel':
        tokens = process_excel_file(file_path, language_code, ignore_identical_translation)
    elif file_type == 'xliff':
        tokens = process_xliff_file(file_path, language_code, ignore_identical_translation)
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
    
    print(f"Found {len(tokens)} unique tokens for language: {language_code}")
    
    # Generate output path if not provided
    if output_path is None:
        base_name = Path(file_path).stem
        output_path = f"{base_name}_{language_code}_tokens.txt"
    
    # Export tokens
    export_tokens_to_txt(tokens, output_path)
    
    return tokens

# Working demonstration
sample_xliff_content = """<?xml version="1.0" encoding="UTF-8"?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
    <file datatype="plaintext" original="sample" source-language="fr-fr" target-language="es-es">
        <body>
            <trans-unit id="test.1" datatype="html">
                <source>Votre alignement est au sommet et vos ennemis n'existent (probablement) plus.</source>
                <target>Tu alineamiento está en la cumbre y (probablemente) tus enemigos ya no existen.</target>
            </trans-unit>
            <trans-unit id="test.2" datatype="html">
                <source>Apogée</source>
                <target>Apogeo</target>
            </trans-unit>
        </body>
    </file>
</xliff>"""

# Create sample file
with open("sample.xliff", "w", encoding="utf-8") as f:
    f.write(sample_xliff_content)

print("Sample XLIFF file created!")

# Process the sample file for Spanish (es-es)
try:
    tokens = process_file("sample.xliff", "es-es", "spanish_tokens.txt")
    print(f"\nExtracted tokens: {sorted(tokens)}")
    
    # Show the content of the output file
    with open("spanish_tokens.txt", "r", encoding="utf-8") as f:
        content = f.read()
    print(f"\nContent of spanish_tokens.txt:\n{content}")
    
except Exception as e:
    print(f"Error: {e}")

# Also test with French (fr-fr) to show source extraction
try:
    print("\n" + "="*50)
    print("Testing with French (source language):")
    tokens_fr = process_file("sample.xliff", "fr-fr", "french_tokens.txt")
    print(f"\nExtracted French tokens: {sorted(tokens_fr)}")
    
    # Show the content of the French output file
    with open("french_tokens.txt", "r", encoding="utf-8") as f:
        content_fr = f.read()
    print(f"\nContent of french_tokens.txt:\n{content_fr}")
    
except Exception as e:
    print(f"Error: {e}")

print("\nDemonstration completed!")

In [None]:
# Demonstration with Excel file
print("\n" + "="*50)
print("Testing with Excel file:")

# Create sample Excel data
sample_data = {
    'key': ['greeting', 'farewell', 'question'],
    'en-us': ['Hello world!', 'Goodbye everyone.', 'How are you?'],
    'es-es': ['¡Hola mundo!', 'Adiós a todos.', '¿Cómo estás?'],
    'fr-fr': ['Bonjour le monde!', 'Au revoir tout le monde.', 'Comment allez-vous?']
}

df = pd.DataFrame(sample_data)
df.to_excel("sample.xlsx", index=False)
print("Sample Excel file created!")
print(f"Excel columns: {list(df.columns)}")
print("Sample data:")
print(df.to_string(index=False))

# Process Excel file for Spanish
try:
    print(f"\nProcessing Excel for es-es:")
    tokens_excel_es = process_file("sample.xlsx", "es-es", "excel_spanish_tokens.txt", ignore_identical_translation=False)
    print(f"Extracted tokens: {sorted(tokens_excel_es)}")
    
    # Show content
    with open("excel_spanish_tokens.txt", "r", encoding="utf-8") as f:
        content = f.read()
    print(f"\nContent of excel_spanish_tokens.txt:\n{content}")
    
except Exception as e:
    print(f"Error: {e}")

# Process Excel file for English
try:
    print(f"\nProcessing Excel for en-us:")
    tokens_excel_en = process_file("sample.xlsx", "en-us", "excel_english_tokens.txt")
    print(f"Extracted tokens: {sorted(tokens_excel_en)}")
    
except Exception as e:
    print(f"Error: {e}")

# Clean up all files
print("\n" + "="*50)
print("Cleaning up files...")
files_to_remove = [
    "sample.xliff", "sample.xlsx", 
    "spanish_tokens.txt", "french_tokens.txt",
    "excel_spanish_tokens.txt", "excel_english_tokens.txt"
]

for file in files_to_remove:
    if os.path.exists(file):
        os.remove(file)
        print(f"Removed: {file}")

print("\nAll demonstrations completed successfully!")
print("\nSUMMARY:")
print("- The script can handle both Excel (.xlsx, .xls) and XLIFF (.xliff, .xlf, .xml) files")
print("- For Excel: extracts text from the column matching the language code")
print("- For XLIFF: extracts text from <source> or <target> elements based on source-language/target-language attributes")
print("- Tokenizes by whitespace and punctuation, preserving hyphens (-) and apostrophes (')")
print("- Exports unique tokens (case-sensitive) to a text file, one token per line")
print("- Usage: process_file(file_path, language_code, optional_output_path)")

# Final run

In [None]:
LANGFILE_PATH = r"C:\Users\Nelso\Downloads\2025-06-13_Retro_TB_as at 6 May 2024.xlsx" # Excel file path (terminology base)
LANGFILE_PATH = r"C:\Users\Nelso\Downloads\export.2025-05-06_14-03-35.fr-fr.es-es.xliff"  # Path to the sample XLIFF file
LANG_CODE = "es-es"
#EXPORT_PATH = r"C:\Users\Nelso\Downloads\spanishTB_tokens.txt"
EXPORT_PATH = r"C:\Users\Nelso\Downloads\spanish_tokens.txt"
# Process the sample file for Spanish (es-es)
try:
    tokens = process_file(LANGFILE_PATH, LANG_CODE, EXPORT_PATH, ignore_identical_translation=True)
    #print(f"\nExtracted tokens: {sorted(tokens)}")
    
    # Show the content of the output file
    #with open("spanish_tokens.txt", "r", encoding="utf-8") as f:
     #   content = f.read()
    #print(f"\nContent of spanish_tokens.txt:\n{content}")
    
except Exception as e:
    print(f"Error: {e}")

# Merge both token files

Output : single list merged from the TB list + TM list.
Purpose: Useful to avoid problematic non-translations in the TM (élément_FR, élément[WIP]_ES), and add the curated non-translation terms from the terminology base (Wabbit_FR = Wabbit_ES).

In [None]:
TXT_PATH1 = r"C:\Users\Nelso\Downloads\spanishTB_tokens.txt" #from TB
TXT_PATH2 = r"C:\Users\Nelso\Downloads\spanish_tokens.txt" #from TM
# Merge two text files into one with unique tokens
def merge_token_files(file1: str, file2: str, output_file: str):
    """Merge two token files into one, ensuring unique tokens"""
    if not os.path.exists(file1) or not os.path.exists(file2):
        raise FileNotFoundError("One or both token files do not exist.")
    
    tokens = set()
    
    # Read first file
    with open(file1, 'r', encoding='utf-8') as f:
        for line in f:
            tokens.add(line.strip())
    
    # Read second file
    with open(file2, 'r', encoding='utf-8') as f:
        for line in f:
            tokens.add(line.strip())
    
    # Write unique tokens to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for token in sorted(tokens):
            f.write(token + '\n')
    
    print(f"Merged {len(tokens)} unique tokens into: {output_file}")

# Merge the two token files
merge_token_files(TXT_PATH1, TXT_PATH2, r"C:\Users\Nelso\Downloads\merged_spanish_tokens.txt")

# Filter words appearing in a common language dictionary

## Filter function v1.0 (prefer v2.0)

In [9]:
# Remove tokens from a token list based on a common language dictionary (Hunspell)
import os  # Add missing import

# merged tokens file
PATH_Ankama_tokens = r"C:\Users\Nelso\Downloads\merged_spanish_tokens.txt"  # Path to the Ankama tokens file
PATH_Hunspell_tokens = r"C:\Users\Nelso\Downloads\es_dicts\es\es_ES.dic"  # Path to the Hunspell tokens file


def filter_tokens_by_dictionary(txt_file_path: str, dic_file_path: str, output_dic_path: str):
    """
    Read tokens from a txt file and a dic file, remove txt tokens that appear in dic,
    then export remaining txt tokens as a dic file with token count
    
    Args:
        txt_file_path: Path to the txt file with tokens (one per line)
        dic_file_path: Path to the dic file (first line is token count, rest are tokens)
        output_dic_path: Path where the filtered dic file will be saved
    """
    if not os.path.exists(txt_file_path):
        raise FileNotFoundError(f"Token file not found: {txt_file_path}")
    
    if not os.path.exists(dic_file_path):
        raise FileNotFoundError(f"Dictionary file not found: {dic_file_path}")
    
    # Read tokens from txt file
    print(f"Reading tokens from: {txt_file_path}")
    txt_tokens = []
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            if token:  # Skip empty lines
                txt_tokens.append(token)
    
    print(f"Loaded {len(txt_tokens)} tokens from txt file")
    
    # Read dictionary file
    print(f"Reading dictionary from: {dic_file_path}")
    with open(dic_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    if not lines:
        raise ValueError("Dictionary file is empty")
    
    # First line is the token count (not language code)
    original_count = lines[0].strip()
    print(f"Dictionary token count: {original_count}")
    
    # Rest are dictionary tokens - clean Hunspell metadata
    dic_tokens = set()
    for line in lines[1:]:
        token = line.strip()
        if token:  # Skip empty lines
            # Clean Hunspell metadata: remove everything after the slash /
            if '/' in token:
                token = token.split('/')[0]
            dic_tokens.add(token.lower())  # Convert to lowercase for case-insensitive matching
    
    print(f"Loaded {len(dic_tokens)} unique tokens from dictionary file (cleaned of Hunspell metadata)")
    
    # Filter txt tokens - remove those that appear in dic_tokens (case-insensitive)
    filtered_tokens = []
    removed_count = 0
    sample_removals = []  # Track some examples for demonstration
    
    for token in txt_tokens:
        if token.lower() in dic_tokens:  # Case-insensitive comparison
            removed_count += 1
            # Store first 10 examples for demonstration
            if len(sample_removals) < 10:
                sample_removals.append(token)
        else:
            filtered_tokens.append(token)
    
    # Show some examples of removed tokens
    if sample_removals:
        print(f"Sample removed tokens (case-insensitive matching): {', '.join(sample_removals[:5])}{'...' if len(sample_removals) > 5 else ''}")
    
    print(f"Removed {removed_count} tokens that appear in dictionary")
    print(f"Remaining tokens: {len(filtered_tokens)}")
    
    # Write filtered tokens as dictionary file
    with open(output_dic_path, 'w', encoding='utf-8') as f:
        # Write token count as first line
        f.write(str(len(filtered_tokens)) + '\n')
        # Write remaining tokens
        for token in filtered_tokens:
            f.write(token + '\n')
    
    print(f"Filtered tokens saved as dictionary to: {output_dic_path}")
    
    return {
        'original_txt_tokens': len(txt_tokens),
        'dic_tokens': len(dic_tokens),
        'removed_tokens': removed_count,
        'remaining_tokens': len(filtered_tokens)
    }

def test_case_insensitive_matching():
    """Test function to demonstrate case-insensitive matching"""
    print("="*60)
    print("TESTING CASE-INSENSITIVE MATCHING")
    print("="*60)
    
    # Create test files
    test_txt_content = """Abanico
abanico
CASA
Casa
mesa
MESA
Libro
libro"""
    
    test_dic_content = """8
abanico/HS
casa/S
mesa/GS
libro/MS
otro/S
palabra/S
ejemplo/MS
test/S"""
    
    # Write test files
    with open("test_tokens.txt", "w", encoding="utf-8") as f:
        f.write(test_txt_content)
    
    with open("test_dict.dic", "w", encoding="utf-8") as f:
        f.write(test_dic_content)
    
    print("Created test files:")
    print("test_tokens.txt:", test_txt_content.replace('\n', ', '))
    print("test_dict.dic content (after first line):", "abanico/HS, casa/S, mesa/GS, libro/MS, ...")
    
    # Test the function
    try:
        result = filter_tokens_by_dictionary("test_tokens.txt", "test_dict.dic", "test_output.dic")
        
        print(f"\nResults:")
        print(f"Original txt tokens: {result['original_txt_tokens']}")
        print(f"Dictionary tokens: {result['dic_tokens']}")
        print(f"Removed tokens: {result['removed_tokens']}")
        print(f"Remaining tokens: {result['remaining_tokens']}")
        
        # Show the output file
        with open("test_output.dic", "r", encoding="utf-8") as f:
            output_content = f.read().strip()
        print(f"\nOutput file content:")
        print(output_content)
        
        # Clean up test files
        import os
        for file in ["test_tokens.txt", "test_dict.dic", "test_output.dic"]:
            if os.path.exists(file):
                os.remove(file)
        
        print("\nExpected behavior:")
        print("- 'Abanico' and 'abanico' should be removed (matches 'abanico/HS')")
        print("- 'CASA' and 'Casa' should be removed (matches 'casa/S')")
        print("- 'mesa' and 'MESA' should be removed (matches 'mesa/GS')")
        print("- 'Libro' and 'libro' should be removed (matches 'libro/MS')")
        print("- No tokens should remain since all match dictionary entries")
        
    except Exception as e:
        print(f"Error in test: {e}")
    
    print("="*60)

# Test function with sample paths
print("="*60)
print("TESTING WITH REAL DATA")
print("="*60)

try:
    result = filter_tokens_by_dictionary(PATH_Ankama_tokens, PATH_Hunspell_tokens, r"C:\Users\Nelso\Downloads\filtered_spanish_tokens.dic")
    
    print("\nFilter result:")
    print(f"Original txt tokens: {result['original_txt_tokens']}")
    print(f"Dictionary tokens: {result['dic_tokens']}")
    print(f"Removed tokens: {result['removed_tokens']}")
    print(f"Remaining tokens: {result['remaining_tokens']}")
    
except Exception as e:
    print(f"Error during filtering: {e}")

# Test case-insensitive matching with examples
test_case_insensitive_matching()

TESTING WITH REAL DATA
Reading tokens from: C:\Users\Nelso\Downloads\merged_spanish_tokens.txt
Loaded 37426 tokens from txt file
Reading dictionary from: C:\Users\Nelso\Downloads\es_dicts\es\es_ES.dic
Dictionary token count: 58221
Loaded 55638 unique tokens from dictionary file (cleaned of Hunspell metadata)
Sample removed tokens (case-insensitive matching): Abajo, Abandonar, Abanico, Abdominal, Abel...
Removed 11211 tokens that appear in dictionary
Remaining tokens: 26215
Filtered tokens saved as dictionary to: C:\Users\Nelso\Downloads\filtered_spanish_tokens.dic

Filter result:
Original txt tokens: 37426
Dictionary tokens: 55638
Removed tokens: 11211
Remaining tokens: 26215
TESTING CASE-INSENSITIVE MATCHING
Created test files:
test_tokens.txt: Abanico, abanico, CASA, Casa, mesa, MESA, Libro, libro
test_dict.dic content (after first line): abanico/HS, casa/S, mesa/GS, libro/MS, ...
Reading tokens from: test_tokens.txt
Loaded 8 tokens from txt file
Reading dictionary from: test_dict.di

## Filtering v2.0
This new algorithm includes morphological patterns of the AFF files to improve the matching rules and remove more common language words from the Ankama dictionary.
* Hunspell resources : https://hunspell.memoq.com/
* AFF (affix morphological patterns) documentation : https://manpages.ubuntu.com/manpages/focal/man5/hunspell.5.html

In [14]:
import re
from typing import Set, Dict, List, Tuple

def parse_aff_file(aff_file_path: str) -> Dict:
    """Parse Hunspell .aff file and extract affix rules"""
    affixes = {'PFX': {}, 'SFX': {}}
    
    with open(aff_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    current_affix = None
    current_type = None
    
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
            
        parts = line.split()
        if not parts:
            continue
            
        # Parse prefix/suffix header definitions (e.g., "PFX a Y 2")
        if parts[0] in ['PFX', 'SFX'] and len(parts) >= 3:
            affix_type = parts[0]
            flag = parts[1]
            cross_product = parts[2] == 'Y'
            
            # Check if this is a header line (has count) or rule line
            if len(parts) >= 4:
                try:
                    # Try to parse as count - if successful, this is a header line
                    count = int(parts[3])
                    # This is a header line
                    if flag not in affixes[affix_type]:
                        affixes[affix_type][flag] = {
                            'cross_product': cross_product,
                            'rules': []
                        }
                    current_affix = flag
                    current_type = affix_type
                    continue
                except ValueError:
                    # Not a number, so this is a rule line
                    pass
            
            # Parse affix rule: PFX/SFX flag strip add condition
            if len(parts) >= 4 and current_affix == flag and current_type == affix_type:
                strip = parts[2] if parts[2] != '0' else ''
                add = parts[3] if parts[3] != '0' else ''
                condition = parts[4] if len(parts) > 4 else '.'
                
                if current_affix in affixes[current_type]:
                    affixes[current_type][current_affix]['rules'].append({
                        'strip': strip,
                        'add': add,
                        'condition': condition
                    })
    
    return affixes

def condition_matches(word: str, condition: str, is_prefix: bool = True) -> bool:
    """Check if word matches the affix condition pattern"""
    if condition == '.':
        return True
    
    try:
        if is_prefix:
            # For prefixes, check the beginning of the word
            return bool(re.match(f'^{condition}', word))
        else:
            # For suffixes, check the end of the word
            return bool(re.search(f'{condition}$', word))
    except re.error:
        # If regex fails, do simple string matching
        if is_prefix:
            return word.startswith(condition.replace('[^', '').replace(']', ''))
        else:
            return word.endswith(condition.replace('[^', '').replace(']', ''))

def generate_word_forms(base_word: str, flags: str, affixes: Dict) -> Set[str]:
    """Generate all possible word forms using affix rules"""
    word_forms = {base_word}  # Always include the base word
    
    if not flags:
        return word_forms
    
    # Process each flag character
    for flag in flags:
        # Apply prefixes
        if flag in affixes['PFX']:
            prefix_rules = affixes['PFX'][flag]['rules']
            for rule in prefix_rules:
                if condition_matches(base_word, rule['condition'], is_prefix=True):
                    # Apply prefix rule
                    if rule['strip']:
                        if base_word.startswith(rule['strip']):
                            modified_word = rule['add'] + base_word[len(rule['strip']):]
                            word_forms.add(modified_word)
                    else:
                        modified_word = rule['add'] + base_word
                        word_forms.add(modified_word)
        
        # Apply suffixes
        if flag in affixes['SFX']:
            suffix_rules = affixes['SFX'][flag]['rules']
            for rule in suffix_rules:
                if condition_matches(base_word, rule['condition'], is_prefix=False):
                    # Apply suffix rule
                    if rule['strip']:
                        if base_word.endswith(rule['strip']):
                            modified_word = base_word[:-len(rule['strip'])] + rule['add']
                            word_forms.add(modified_word)
                    else:
                        modified_word = base_word + rule['add']
                        word_forms.add(modified_word)
    
    return word_forms

def filter_tokens_by_dictionary_with_affixes(txt_file_path: str, dic_file_path: str, aff_file_path: str, output_dic_path: str):
    """
    Enhanced version that uses Hunspell affix rules for better matching
    
    Args:
        txt_file_path: Path to the txt file with tokens (one per line)
        dic_file_path: Path to the dic file (first line is token count, rest are tokens)
        aff_file_path: Path to the .aff file with affix rules
        output_dic_path: Path where the filtered dic file will be saved
    """
    if not os.path.exists(txt_file_path):
        raise FileNotFoundError(f"Token file not found: {txt_file_path}")
    
    if not os.path.exists(dic_file_path):
        raise FileNotFoundError(f"Dictionary file not found: {dic_file_path}")
        
    if not os.path.exists(aff_file_path):
        raise FileNotFoundError(f"Affix file not found: {aff_file_path}")
    
    # Parse affix rules
    print(f"Parsing affix rules from: {aff_file_path}")
    affixes = parse_aff_file(aff_file_path)
    prefix_count = sum(len(rules['rules']) for rules in affixes['PFX'].values())
    suffix_count = sum(len(rules['rules']) for rules in affixes['SFX'].values())
    print(f"Loaded {len(affixes['PFX'])} prefix flags ({prefix_count} rules) and {len(affixes['SFX'])} suffix flags ({suffix_count} rules)")
    
    # Read tokens from txt file - preserve original case
    print(f"Reading tokens from: {txt_file_path}")
    original_txt_tokens = []  # Keep original case
    with open(txt_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            if token:
                original_txt_tokens.append(token)  # Preserve original case
    
    print(f"Loaded {len(original_txt_tokens)} tokens from txt file")
    
    # Read dictionary file and generate all word forms
    print(f"Reading dictionary and generating word forms from: {dic_file_path}")
    with open(dic_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    if not lines:
        raise ValueError("Dictionary file is empty")
    
    # First line is the token count
    original_count = lines[0].strip()
    print(f"Dictionary token count: {original_count}")
    
    # Generate all possible word forms from dictionary (in lowercase for matching)
    all_dictionary_forms = set()
    processed_entries = 0
    
    for line in lines[1:]:
        line = line.strip()
        if not line:
            continue
            
        processed_entries += 1
        if processed_entries % 1000 == 0:
            print(f"Processed {processed_entries} dictionary entries...")
        
        # Parse dictionary entry
        if '/' in line:
            base_word, flags = line.split('/', 1)
        else:
            base_word, flags = line, ''
        
        # Generate all word forms for this base word (lowercase for matching)
        word_forms = generate_word_forms(base_word.lower(), flags, affixes)
        all_dictionary_forms.update(word_forms)
    
    print(f"Generated {len(all_dictionary_forms)} unique word forms from {processed_entries} dictionary entries")
    
    # Filter txt tokens - remove those that match any dictionary form
    # Compare lowercase versions but keep original case for output
    filtered_tokens = []
    removed_count = 0
    sample_removals = []
    
    for original_token in original_txt_tokens:  # Use original case tokens
        if original_token.lower() in all_dictionary_forms:  # Compare with lowercase
            removed_count += 1
            if len(sample_removals) < 10:
                sample_removals.append(original_token)  # Show original case in samples
        else:
            filtered_tokens.append(original_token)  # Keep original case
    
    # Show some examples of removed tokens
    if sample_removals:
        print(f"Sample removed tokens: {', '.join(sample_removals[:5])}{'...' if len(sample_removals) > 5 else ''}")
    
    print(f"Removed {removed_count} tokens that match dictionary word forms")
    print(f"Remaining tokens: {len(filtered_tokens)}")
    
    # Write filtered tokens as dictionary file (preserving original case)
    with open(output_dic_path, 'w', encoding='utf-8') as f:
        f.write(str(len(filtered_tokens)) + '\n')
        for token in filtered_tokens:  # These already have original case
            f.write(token + '\n')
    
    print(f"Filtered tokens saved as dictionary to: {output_dic_path}")
    
    return {
        'original_txt_tokens': len(original_txt_tokens),
        'dictionary_base_words': processed_entries,
        'generated_word_forms': len(all_dictionary_forms),
        'removed_tokens': removed_count,
        'remaining_tokens': len(filtered_tokens)
    }

# Test the enhanced function
print("="*70)
print("TESTING ENHANCED DICTIONARY FILTERING WITH AFFIX RULES")
print("="*70)

# Example usage
AFF_FILE_PATH = r"C:\Users\Nelso\Downloads\es_dicts\es\es_ES.aff"  # Path to .aff file

if os.path.exists(AFF_FILE_PATH):
    try:
        result = filter_tokens_by_dictionary_with_affixes(
            PATH_Ankama_tokens,      # txt file with tokens to filter
            PATH_Hunspell_tokens,    # dic file
            AFF_FILE_PATH,           # aff file with rules
            r"C:\Users\Nelso\Downloads\enhanced_filtered_tokens.dic"
        )
        
        print("\nENHANCED FILTERING RESULTS:")
        print("="*50)
        print(f"Original txt tokens: {result['original_txt_tokens']}")
        print(f"Dictionary base words: {result['dictionary_base_words']}")
        print(f"Generated word forms: {result['generated_word_forms']}")
        print(f"Removed tokens: {result['removed_tokens']}")
        print(f"Remaining tokens: {result['remaining_tokens']}")
        
        # Calculate improvement
        improvement = result['generated_word_forms'] - result['dictionary_base_words']
        print(f"Affix expansion factor: {result['generated_word_forms'] / result['dictionary_base_words']:.2f}x")
        print(f"Additional word forms from affixes: {improvement}")
        
    except Exception as e:
        print(f"Error: {e}")
else:
    print(f"Affix file not found: {AFF_FILE_PATH}")
    print("Please provide the correct path to the .aff file")

TESTING ENHANCED DICTIONARY FILTERING WITH AFFIX RULES
Parsing affix rules from: C:\Users\Nelso\Downloads\es_dicts\es\es_ES.aff
Loaded 29 prefix flags (80 rules) and 70 suffix flags (6650 rules)
Reading tokens from: C:\Users\Nelso\Downloads\merged_spanish_tokens.txt
Loaded 37426 tokens from txt file
Reading dictionary and generating word forms from: C:\Users\Nelso\Downloads\es_dicts\es\es_ES.dic
Dictionary token count: 58221
Processed 1000 dictionary entries...
Processed 2000 dictionary entries...
Processed 3000 dictionary entries...
Processed 4000 dictionary entries...
Processed 3000 dictionary entries...
Processed 4000 dictionary entries...
Processed 5000 dictionary entries...
Processed 6000 dictionary entries...
Processed 5000 dictionary entries...
Processed 6000 dictionary entries...
Processed 7000 dictionary entries...
Processed 8000 dictionary entries...
Processed 9000 dictionary entries...
Processed 7000 dictionary entries...
Processed 8000 dictionary entries...
Processed 9000 d

# Enhanced Language File Processor - Complete Summary

## Features

The script now includes **comprehensive filtering** with multiple advanced conditions to ensure high-quality token extraction.

### Supported File Types
- **Excel files** (`.xlsx`, `.xls`): Language code as column name
- **XLIFF files** (`.xliff`, `.xlf`, `.xml`): Language code in `source-language` or `target-language` attributes

### Key Functionality
1. **File Type Detection**: Automatically detects file type based on extension
2. **Language Matching**: 
   - Excel: Extracts from column matching the language code
   - XLIFF: Extracts from `<source>` or `<target>` elements based on language attributes

### **COMPREHENSIVE Filtering System**
3. **Square Bracket Filtering**: Ignores entries where source text contains `[.+]` pattern
4. **Target = Source Filtering**: Ignores entries where target text equals source text
5. **All-Caps Target Filtering**: **NEW** - Ignores entries where target text is entirely in uppercase
6. **HTML Tag Removal**: **NEW** - Removes HTML tags and decodes HTML entities before tokenization
7. **Hyperlink & Email Removal**: Removes URLs and email addresses before tokenization
8. **Token Edge Cleaning**: **NEW** - Removes leading/trailing apostrophes and hyphens from tokens
9. **Short Token Filtering**: Removes tokens with length < 3 characters
10. **Same Character Chain Filtering**: Removes tokens that are chains of the same character (e.g., "aaa", "zzZZzz")
11. **Number-Only Token Filtering**: **NEW** - Removes tokens that consist only of digits
12. **Time Pattern Filtering**: **NEW** - Removes tokens matching `\d+(PA|PM|AM|AL)` pattern
13. **Digit-Word Pattern Filtering**: **NEW** - Removes tokens matching `\d+-\w+` pattern (e.g., "123-neutral")
14. **Enhanced Punctuation**: **NEW** - Includes º character in punctuation list
15. **Tokenization**: Splits by whitespace and punctuation, preserving hyphens (`-`) and apostrophes (`'`)
16. **Export**: Saves unique tokens (case-sensitive) to text file, one per line

### Usage
```python
# Basic usage
tokens = process_file(file_path, language_code)

# With custom output path
tokens = process_file(file_path, language_code, output_path)
```

### Example Advanced Filtering Results
**Input Processing:**
- ✅ **"Hola mundo"** → `['Hola', 'mundo']`
- ❌ **"[Debug] test"** → Skipped (square brackets in source)
- ❌ **"Same text"** → Skipped (target equals source)
- ❌ **"TODO EN MAYÚSCULAS"** → Skipped (all caps target)
- ✅ **HTML content** → Tags removed, entities decoded
- ✅ **"'Resistencia 'Robo'"** → `['Resistencia', 'Robo']` (edges cleaned)
- ❌ **Number tokens: "123", "456"** → Filtered out (numbers only)
- ❌ **Time patterns: "3PM", "10AM"** → Filtered out (time pattern)
- ❌ **Digit-word: "123-neutral"** → Filtered out (digit-word pattern)
- ✅ **"25º celsius"** → `['celsius']` (º treated as punctuation)

**Final Result:** Only meaningful, clean tokens ≥ 3 characters from appropriate entries

# Draft tests (unitary tests TODO)

In [None]:
# Quick verification test for the fixes
print("="*50)
print("VERIFYING FIXES")
print("="*50)

# Test the corrected time pattern function
test_time_tokens = ["3PM", "10AM", "5PA", "12AL"]
print("Testing corrected time patterns:")
for token in test_time_tokens:
    matches = matches_time_pattern(token)
    print(f"'{token}' -> matches time pattern: {matches}")

# Test º character removal
test_text = "Temperature: 25º celsius"
print(f"\nTesting º removal:")
print(f"Original: '{test_text}'")
tokens = tokenize_text(test_text)
print(f"Tokens: {sorted(tokens)}")

# Test all filtering combined
test_combined_text = "Meeting at 3PM, temperature 25º, status: 123-neutral, numbers 456"
print(f"\nTesting combined filtering:")
print(f"Original: '{test_combined_text}'")
tokens_combined = tokenize_text(test_combined_text)
print(f"Final tokens: {sorted(tokens_combined)}")

print("\nExpected results:")
print("- Time patterns (3PM) should be filtered out")
print("- Numbers (456) should be filtered out") 
print("- Digit-word patterns (123-neutral) should be filtered out")
print("- º should be treated as punctuation")
print("- Only meaningful words should remain")

In [None]:
# Test the fix for HTML br and p tag handling
print("="*60)
print("TESTING HTML BR AND P TAG HANDLING FIX")
print("="*60)

# Test cases that demonstrate the issue and fix
test_html_cases = [
    "Ankama&lt;br&gt;&lt;br&gt;1.",
    "Word1&lt;br&gt;Word2",
    "Start&lt;p&gt;Middle&lt;/p&gt;End",
    "Text&lt;br/&gt;More text",
    "Line1&lt;BR&gt;Line2",  # Test case insensitive
    "Para&lt;P class='test'&gt;Content&lt;/P&gt;After",
    "Normal text without HTML tags"
]

print("Testing HTML tag removal with br/p handling:")
for text in test_html_cases:
    cleaned = remove_html_tags(text)
    tokens = tokenize_text(text)
    print(f"Original: '{text}'")
    print(f"Cleaned:  '{cleaned}'")
    print(f"Tokens:   {sorted(tokens)}")
    print()

# Specific test for the reported issue
print("="*40)
print("SPECIFIC TEST FOR REPORTED ISSUE")
print("="*40)

issue_text = "Ankama&lt;br&gt;&lt;br&gt;1."
print(f"Testing: '{issue_text}'")

# Before fix (simulate): would result in "Ankama1"
# After fix: should result in separate tokens
cleaned_text = remove_html_tags(issue_text)
final_tokens = tokenize_text(issue_text)

print(f"HTML removed: '{cleaned_text}'")
print(f"Final tokens: {sorted(final_tokens)}")
print(f"✅ Issue fixed: 'Ankama' and other meaningful tokens are separate" if 'Ankama' in final_tokens else "❌ Issue not fixed")

# Test with a more complex example
complex_html = "Company&lt;br&gt;&lt;br&gt;Address&lt;p&gt;City&lt;/p&gt;Country123"
print(f"\nComplex example: '{complex_html}'")
complex_tokens = tokenize_text(complex_html)
print(f"Tokens: {sorted(complex_tokens)}")
print("Expected: Company, Address, City, Country123 should be separate tokens")

In [None]:
# Test the new ignore_identical_translation parameter
print("="*70)
print("TESTING ignore_identical_translation PARAMETER")
print("="*70)

# Create test data with identical translations
test_data_identical = {
    'key': ['greeting', 'same1', 'same2', 'different'],
    'fr-fr': ['Bonjour', 'Same Text', 'Identical', 'Source Text'],
    'es-es': ['Hola', 'Same Text', 'Identical', 'Target Text']  # First two are identical to source
}

df_identical = pd.DataFrame(test_data_identical)
df_identical.to_excel("test_identical.xlsx", index=False)
print("Test Excel file with identical translations created!")
print("Test data:")
print(df_identical.to_string(index=False))

# Test with ignore_identical_translation=True (default)
print(f"\n1. Testing with ignore_identical_translation=True (default):")
try:
    tokens_ignore_true = process_file("test_identical.xlsx", "es-es", "tokens_ignore_true.txt")
    print(f"Tokens with ignore=True: {sorted(tokens_ignore_true)}")
    print("Expected: 'Same Text' and 'Identical' should be skipped")
except Exception as e:
    print(f"Error: {e}")

# Test with ignore_identical_translation=False
print(f"\n2. Testing with ignore_identical_translation=False:")
try:
    tokens_ignore_false = process_file("test_identical.xlsx", "es-es", "tokens_ignore_false.txt", ignore_identical_translation=False)
    print(f"Tokens with ignore=False: {sorted(tokens_ignore_false)}")
    print("Expected: 'Same Text' and 'Identical' should be included")
except Exception as e:
    print(f"Error: {e}")

# Show the difference
if 'tokens_ignore_true' in locals() and 'tokens_ignore_false' in locals():
    additional_tokens = tokens_ignore_false - tokens_ignore_true
    print(f"\nAdditional tokens when ignore_identical_translation=False: {sorted(additional_tokens)}")

# Also test with XLIFF
test_xliff_identical = """<?xml version="1.0" encoding="UTF-8"?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
    <file datatype="plaintext" original="test" source-language="fr-fr" target-language="es-es">
        <body>
            <trans-unit id="test.1">
                <source>Hello World</source>
                <target>Hola Mundo</target>
            </trans-unit>
            <trans-unit id="test.2">
                <source>Same Text</source>
                <target>Same Text</target>
            </trans-unit>
            <trans-unit id="test.3">
                <source>Identical</source>
                <target>Identical</target>
            </trans-unit>
        </body>
    </file>
</xliff>"""

with open("test_identical.xliff", "w", encoding="utf-8") as f:
    f.write(test_xliff_identical)

print(f"\n3. Testing XLIFF with ignore_identical_translation=True:")
try:
    xliff_tokens_true = process_file("test_identical.xliff", "es-es", "xliff_tokens_true.txt")
    print(f"XLIFF tokens with ignore=True: {sorted(xliff_tokens_true)}")
except Exception as e:
    print(f"Error: {e}")

print(f"\n4. Testing XLIFF with ignore_identical_translation=False:")
try:
    xliff_tokens_false = process_file("test_identical.xliff", "es-es", "xliff_tokens_false.txt", ignore_identical_translation=False)
    print(f"XLIFF tokens with ignore=False: {sorted(xliff_tokens_false)}")
except Exception as e:
    print(f"Error: {e}")

# Clean up test files
print("\nCleaning up test files...")
test_files = [
    "test_identical.xlsx", "test_identical.xliff",
    "tokens_ignore_true.txt", "tokens_ignore_false.txt",
    "xliff_tokens_true.txt", "xliff_tokens_false.txt"
]
for file in test_files:
    if os.path.exists(file):
        os.remove(file)
        print(f"Removed: {file}")

print("\nParameter test completed!")
print("\nSUMMARY:")
print("- ignore_identical_translation=True (default): Skips entries where target equals source")
print("- ignore_identical_translation=False: Includes all entries, even identical translations")
print("- This allows users to control whether to include identical translations in their token extraction")