In [2]:
import polars as pl
import re
from tqdm import tqdm
import time

# Function to correct geoname casing with optimized rules
def correct_geoname_casing(corrected_term):
    if not corrected_term or not isinstance(corrected_term, str):
        return corrected_term
    
    # EARLY EXIT 1: If term is already properly capitalized (first letter uppercase, not all caps)
    if corrected_term and corrected_term[0].isupper() and not corrected_term.isupper():
        # Check if it contains mixed casing errors (like "asSyrians" or "balanites aEgyptiaca")
        has_mixed_case_error = False
        words = corrected_term.split()
        for word in words:
            # Look for patterns like "asSyrians" (lowercase start + uppercase in middle)
            if len(word) > 1 and word[0].islower() and any(c.isupper() for c in word[1:]):
                has_mixed_case_error = True
                break
            # Look for patterns like "aEgyptiaca" (lowercase + uppercase in middle)
            if len(word) > 2 and any(word[i].islower() and word[i+1].isupper() for i in range(len(word)-1)):
                has_mixed_case_error = True
                break
        
        # If no mixed case errors and not all uppercase, likely already correct
        if not has_mixed_case_error:
            # Still check language patterns and geoname corrections
            result = corrected_term
            
            # Apply language pattern correction
            match = language_pattern.search(result)
            if match:
                prefix = match.group(1)
                suffix = match.group(2)
                # Capitalize the prefix (language name), handling hyphens
                words = prefix.replace('-', ' - ').split()
                capitalized_words = []
                for word in words:
                    if word == '-':
                        capitalized_words.append('-')
                    else:
                        capitalized_words.append(word.capitalize())
                new_prefix = "".join(capitalized_words).replace(' - ', '-')
                result = result.replace(match.group(0), f"{new_prefix} {suffix}")
            
            # Apply geoname corrections only if the term might contain geonames
            # BUT with stricter exact matching only
            result_lower = result.lower()
            if result_lower in geoname_case_mapping:
                return geoname_case_mapping[result_lower]
            
            # For multi-word terms, apply word-by-word corrections with proper boundary checking
            words = result.split()
            corrected_words = []
            for word in words:
                clean_word = re.sub(r'[^\w\-]', '', word)
                punctuation = re.sub(r'[\w\-]', '', word)
                clean_word_lower = clean_word.lower()
                # Only match complete words, never substrings
                if clean_word_lower in geoname_case_mapping and clean_word_lower == clean_word.lower():
                    corrected_words.append(geoname_case_mapping[clean_word_lower] + punctuation)
                elif clean_word_lower in geoname_plurals and clean_word_lower == clean_word.lower():
                    corrected_words.append(geoname_plurals[clean_word_lower] + punctuation)
                else:
                    # Hyphenated compounds
                    if '-' in clean_word and len(clean_word.split('-')) == 2:
                        parts = clean_word.split('-')
                        corrected_parts = []
                        for part in parts:
                            part_lower = part.lower()
                            if part_lower in geoname_case_mapping and part_lower == part.lower():
                                corrected_parts.append(geoname_case_mapping[part_lower])
                            else:
                                corrected_parts.append(part)
                        corrected_words.append('-'.join(corrected_parts) + punctuation)
                    else:
                        corrected_words.append(word)
            return " ".join(corrected_words)
        else:
            # If mixed case error, DO NOT attempt substring replacement!
            # Just return the original term, or apply only strict word-boundary corrections
            words = corrected_term.split()
            corrected_words = []
            for word in words:
                clean_word = re.sub(r'[^\w\-]', '', word)
                punctuation = re.sub(r'[\w\-]', '', word)
                clean_word_lower = clean_word.lower()
                if clean_word_lower in geoname_case_mapping and clean_word_lower == clean_word.lower():
                    corrected_words.append(geoname_case_mapping[clean_word_lower] + punctuation)
                elif clean_word_lower in geoname_plurals and clean_word_lower == clean_word.lower():
                    corrected_words.append(geoname_plurals[clean_word_lower] + punctuation)
                else:
                    if '-' in clean_word and len(clean_word.split('-')) == 2:
                        parts = clean_word.split('-')
                        corrected_parts = []
                        for part in parts:
                            part_lower = part.lower()
                            if part_lower in geoname_case_mapping and part_lower == part.lower():
                                corrected_parts.append(geoname_case_mapping[part_lower])
                            else:
                                corrected_parts.append(part)
                        corrected_words.append('-'.join(corrected_parts) + punctuation)
                    else:
                        corrected_words.append(word)
            return " ".join(corrected_words)
    
    # EARLY EXIT 2: For simple single-word lowercase terms
    if ' ' not in corrected_term and corrected_term.islower():
        # Check if it's a simple geoname replacement (exact match only)
        lower_term = corrected_term.lower()
        if lower_term in geoname_case_mapping:
            return geoname_case_mapping[lower_term]
        elif lower_term in geoname_plurals:
            return geoname_plurals[lower_term]
        else:
            return corrected_term  # <-- This ensures "formalities" stays "formalities"

    
    result = corrected_term
    
    # Apply language pattern correction (handles hyphens now)
    match = language_pattern.search(result)
    if match:
        prefix = match.group(1)
        suffix = match.group(2)
        # Capitalize the prefix (language name), handling hyphens properly
        words = prefix.replace('-', ' - ').split()
        capitalized_words = []
        for word in words:
            if word == '-':
                capitalized_words.append('-')
            else:
                capitalized_words.append(word.capitalize())
        new_prefix = "".join(capitalized_words).replace(' - ', '-')
        result = result.replace(match.group(0), f"{new_prefix} {suffix}")
    
    # Apply geoname replacements if likely to contain geonames
    result_lower = result.lower()
    
    # Check for exact matches first (entire term)
    if result_lower in geoname_case_mapping:
        return geoname_case_mapping[result_lower]
    
    # For multi-word terms, apply word-by-word corrections ONLY
    # This prevents substring matching within words
    words = result.split()
    if len(words) == 1 and words[0].islower() and words[0].lower() not in geoname_case_mapping and words[0].lower() not in geoname_plurals:
        return corrected_term
    corrected_words = []
    word_changed = False
    
    for word in words:
        # Remove punctuation for matching but preserve it in output
        clean_word = re.sub(r'[^\w\-]', '', word)
        punctuation = re.sub(r'[\w\-]', '', word)
        
        clean_word_lower = clean_word.lower()
        
        # Only match complete words, never substrings
        if clean_word_lower in geoname_case_mapping:
            corrected_words.append(geoname_case_mapping[clean_word_lower] + punctuation)
            word_changed = True
        elif clean_word_lower in geoname_plurals:
            corrected_words.append(geoname_plurals[clean_word_lower] + punctuation)
            word_changed = True
        else:
            # Check for hyphenated compounds (like "french-syrian")
            if '-' in clean_word and len(clean_word.split('-')) == 2:
                parts = clean_word.split('-')
                corrected_parts = []
                for part in parts:
                    part_lower = part.lower()
                    if part_lower in geoname_case_mapping:
                        corrected_parts.append(geoname_case_mapping[part_lower])
                        word_changed = True
                    else:
                        corrected_parts.append(part)
                corrected_words.append('-'.join(corrected_parts) + punctuation)
            else:
                corrected_words.append(word)
    
    return " ".join(corrected_words)

def fix_middle_cased_word(word):
            # If all caps, keep as is
            if word.isupper():
                return word
            # If first char is uppercase and rest are lowercase, keep as is
            if len(word) > 1 and word[0].isupper() and word[1:].islower():
                return word
            # If any uppercase in the middle, set them to lowercase
            chars = [word[0]]
            for c, prev in zip(word[1:], word[:-1]):
                if c.isupper() and not prev == ' ':
                    chars.append(c.lower())
                else:
                    chars.append(c)
            return ''.join(chars)

def fix_middle_cased_text(text):
    # Only fix words with mixed casing, not all caps
    def fix_word(word):
        # Remove punctuation for checking
        clean = re.sub(r'[^\w\-]', '', word)
        if len(clean) > 1 and any(c.isupper() for c in clean[1:]) and not clean.isupper():
            # Fix only the clean part, then re-attach punctuation
            fixed = fix_middle_cased_word(clean)
            return word.replace(clean, fixed)
        return word
    return ' '.join([fix_word(w) for w in text.split()])

In [21]:

# Load geonames for exclusion
geonames_file_path = "data/M49_countries.xlsx"
geonames_df = pl.read_excel(geonames_file_path)
en_corrections2 = pl.read_excel("data/en_corrections_processed_lemmatized2.xlsx")
# Check if en_corrections2 exists in local variables
if 'en_corrections2' not in locals():
    en_corrections2 = pl.read_excel("data/en_corrections_processed_lemmatized2.xlsx")
    

# Load the emoji country nationality list to enrich the geonames data
emoji_countries_df = pl.read_csv("data/emoji_country_nationality_list.csv")

# Create a comprehensive list of all geographical names from both sources
english_geonames = []

# Get existing English geonames from M49 countries
if "English" in geonames_df.columns:
    existing_english = geonames_df.select("English").filter(
        pl.col("English").is_not_null()
    ).to_series().to_list()
    english_geonames.extend(existing_english)

# Add country names from emoji list
country_names = emoji_countries_df.select("Name").filter(
    pl.col("Name").is_not_null()
).to_series().to_list()
english_geonames.extend(country_names)

# Add all demonyms from emoji list (Demonym 1, 2, 3)
for demonym_col in ["Demonym 1", "Demonym 2", "Demonym 3"]:
    demonyms = emoji_countries_df.select(demonym_col).filter(
        (pl.col(demonym_col).is_not_null()) &
        (pl.col(demonym_col).str.strip_chars() != "")
    ).to_series().to_list()
    english_geonames.extend(demonyms)

# Remove duplicates and empty strings
english_geonames = list(set([name for name in english_geonames if name and str(name).strip()]))

print(f"Total geographical names for exclusion: {len(english_geonames)}")
print(f"Sample geonames: {english_geonames[:10]}")

# Check if en_corrections2 exists
if 'en_corrections2' in locals():
    print("Processing English corrections...")
    print(f"en_corrections2 has {en_corrections2.shape[0]} rows")
    
    # Add additional terms to geonames list (including new Latin America terms)
    additional_geonames = [
        "Asian", "European", "African", "American", "Australian", "Antarctic",
        "Latin America", "Latin American", "CFA", "United States", "Polynesian", "Tibetan",
        "Salvadorian", "Salvadoran"
    ]
    english_geonames.extend(additional_geonames)
    
    # Remove duplicates and empty strings
    english_geonames = list(set([name for name in english_geonames if name and str(name).strip()]))
    
    # OPTIMIZATION 1: Create comprehensive lookup sets
    geoname_case_mapping = {name.lower(): name for name in english_geonames if name}
    geoname_lower_set = set(geoname_case_mapping.keys())
    
    # OPTIMIZATION 2: Pre-compile regex patterns for common cases
    # Updated pattern to include hyphenated words before "language(s)"
    language_pattern = re.compile(r'\b([\w\-]+(?:\s+[\w\-]+)*)\s+(languages?)\b', re.IGNORECASE)
    
    # OPTIMIZATION 3: Create optimized lookup for plurals (geoname + 's')
    geoname_plurals = {(name.lower() + 's'): (name + 's') for name in english_geonames if name}
    
    # Use Polars for vectorized processing
    corrected_terms_series = en_corrections2.select("corrected").to_series()
    
    print(f"Starting geoname case correction on {len(corrected_terms_series)} terms...")
    
    start_time = time.time()
    
    # Use Polars map_elements with optimized function
    try:
        # Try vectorized approach first
        print("Attempting vectorized processing...")
        en_corrections_low2 = en_corrections2.with_columns(
            pl.col("corrected").map_elements(
                correct_geoname_casing,
                return_dtype=pl.Utf8
            ).alias("corrected")
        )
        

        # Apply the fix to the corrected column
        en_corrections_low2 = en_corrections_low2.with_columns(
            pl.col("corrected").map_elements(
                fix_middle_cased_text,
                return_dtype=pl.Utf8
            ).alias("corrected")
        )
        elapsed_time = time.time() - start_time
        print(f"Vectorized processing completed in {elapsed_time:.2f} seconds")
        
    except Exception as e:
        print(f"Vectorized processing failed: {e}")
        print("Falling back to iterative processing with optimizations...")
        
        # Fallback to optimized iterative processing
        corrected_terms = corrected_terms_series.to_list()
        corrected_results = []
        
        # Use tqdm with less frequent updates for better performance
        with tqdm(total=len(corrected_terms), 
                  desc="Processing corrections", 
                  unit="terms",
                  mininterval=1.0,  # Update every 1 second minimum
                  ncols=100) as pbar:
            
            batch_size = 1000  # Process in batches
            for i in range(0, len(corrected_terms), batch_size):
                batch = corrected_terms[i:i + batch_size]
                batch_results = [correct_geoname_casing(term) for term in batch]
                corrected_results.extend(batch_results)
                
                # Update progress bar
                pbar.update(len(batch))
                
                # Calculate and display timing info every batch
                if i % (batch_size * 5) == 0 or i + batch_size >= len(corrected_terms):
                    elapsed_time = time.time() - start_time
                    terms_per_second = (i + len(batch)) / elapsed_time
                    remaining_terms = len(corrected_terms) - (i + len(batch))
                    estimated_remaining_time = remaining_terms / terms_per_second if terms_per_second > 0 else 0
                    
                    # Update progress bar with timing info
                    pbar.set_postfix({
                        'rate': f'{terms_per_second:.1f} terms/sec',
                        'ETA': f'{estimated_remaining_time:.1f}s',
                        'changes': sum(1 for j in range(i + len(batch)) if corrected_results[j] != corrected_terms[j])
                    })
        
        elapsed_time = time.time() - start_time
        print(f"Iterative processing completed in {elapsed_time:.2f} seconds")
        print(f"Average rate: {len(corrected_terms) / elapsed_time:.1f} terms per second")
        
        # Create the corrected DataFrame
        en_corrections_low2 = en_corrections2.clone().with_columns(
            pl.Series("corrected", corrected_results)
        )
    
    # Save the corrected English data
    en_corrections_low2.write_excel("data/en_corrections_processed2.xlsx")
    print(f"English corrections saved to data/en_corrections_processed2.xlsx with {en_corrections_low2.shape[0]} rows")
    
    # Display summary of changes
    original_terms = en_corrections2.select("corrected").to_series()
    new_terms = en_corrections_low2.select("corrected").to_series()
    changes_made = (original_terms != new_terms).sum()
    print(f"Total corrections with changes: {changes_made}")
    
    # Show first few rows
    print("\nFirst few rows of processed corrections:")
    print(en_corrections_low2.head())
    
    # Test specific cases (updated test cases)
    test_cases = [
        "French", "Argentinians", "Baltic Sea", "Indian Sea", 
        "french", "argentinians", "baltic sea", "indian sea",
        "ASSYRIANS", "Assyrians", "asSyrians", "syrian", "syrians", 
        "french-syrian", "transfrench", "puerto ricans", "United States",
        "slavic languages", "budukh language", "english language", "ural-Altaic languages",
        "asian countries", "european union", "balanites aegyptiaca",
        "Latin America", "Latin American"
    ]
    print("\nTesting specific cases:")
    for test_case in test_cases:
        corrected = correct_geoname_casing(test_case)
        print(f"'{test_case}' -> '{corrected}'")
    
else:
    print("en_corrections2 not found. Please run the English batch processing first.")

Total geographical names for exclusion: 586
Sample geonames: ['Israeli', 'Kyrgyzstani', 'France', 'Croat', 'Egypt', 'Czech', 'Futunan', 'Montserratian', 'Myanmarese', 'Cambodia']
Processing English corrections...
en_corrections2 has 7279 rows
Starting geoname case correction on 7279 terms...
Attempting vectorized processing...
Vectorized processing completed in 0.18 seconds
English corrections saved to data/en_corrections_processed2.xlsx with 7279 rows
Total corrections with changes: 97

First few rows of processed corrections:
shape: (5, 6)
┌────────────────────┬────────────────────┬─────────┬───────────────┬─────────┬────────────────────┐
│ original           ┆ corrected          ┆ changed ┆ language_code ┆ column  ┆ lemmatized         │
│ ---                ┆ ---                ┆ ---     ┆ ---           ┆ ---     ┆ ---                │
│ str                ┆ str                ┆ bool    ┆ str           ┆ str     ┆ str                │
╞════════════════════╪════════════════════╪═════

## Lemmatization

In [13]:
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_md")

def spacy_lemmatize(text):
    doc = nlp(text)
    entity_indices = set()
    for ent in doc.ents:
        if ent.label_ in {"NORP", "LANGUAGE", "LOC"}:
            entity_indices.update(range(ent.start, ent.end))
    # Lemmatize only nouns and adjectives, except for named entities
    return ''.join([
        (token.text if i in entity_indices else (token.lemma_ if token.pos_ in {"NOUN", "ADJ"} else token.text)) + token.whitespace_
        for i, token in enumerate(doc)
    ])

# Sample test cases
sample_texts = [
    "folders",
    "the Albanian refugees",
    "Americans",
    "Syrian children",
    "formalities",
    "French folders",
    "French language",
    "Syrian locations",
    "burkina-be",
    "world war (1950-1953)",
    "Mi'kmaq language",
]

print("spaCy lemmatization samples:")
for text in sample_texts:
    print(f"{text} -> {spacy_lemmatize(text)}")

# Apply spaCy lemmatization (with NORP, LANGUAGE, LOC exception) to the "corrected" column
en_corrections_lowLemmatized = en_corrections_low2.with_columns(
    pl.col("corrected").map_elements(
        spacy_lemmatize,
        return_dtype=pl.Utf8
    ).alias("lemmatized")
)

# Count how many corrected terms were changed
changes_count = (en_corrections_low2["corrected"] != en_corrections_lowLemmatized["lemmatized"]).sum()
print(f"Total terms changed by spaCy lemmatization: {changes_count}")

spaCy lemmatization samples:
folders -> folder
the Albanian refugees -> the Albanian refugee
Americans -> Americans
Syrian children -> Syrian child
formalities -> formality
French folders -> French folder
French language -> French language
Syrian locations -> Syrian location
burkina-be -> burkina-be
world war (1950-1953) -> world war (1950-1953)
Mi'kmaq language -> Mi'kmaq language
Total terms changed by spaCy lemmatization: 3133


In [14]:
# Export the lemmatized corrections
en_corrections_lowLemmatized.write_excel("data/en_corrections_processed_lemmatized.xlsx")

<xlsxwriter.workbook.Workbook at 0x29195c69400>

# Agrovoc check

In [24]:
agrovoc_path = "data/agrovoc_enhanced.xlsx"
agrovoc_df = pl.read_excel(agrovoc_path)

# Create a lowercase lookup for Agrovoc terms
agrovoc_lookup = {str(term).lower(): term for term in agrovoc_df["English"].to_list() if term and str(term).strip()}

def agrovoc_case_fix(term):
    key = str(term).strip().lower()
    return agrovoc_lookup.get(key, term)

# Apply Agrovoc case fix to the "corrected" column
en_corrections_Agrocheck = en_corrections_Agrocheck.with_columns(
    pl.col("corrected").map_elements(
        agrovoc_case_fix,
        return_dtype=pl.Utf8
    ).alias("corrected")
)

# Count how many terms were changed by Agrovoc case fix
agrovoc_changes_count = (
    en_corrections_low2["corrected"] != en_corrections_Agrocheck["corrected"]
).sum()
print(f"Total terms changed by Agrovoc case fix: {agrovoc_changes_count}")

# Save the Agrovoc-checked corrections
en_corrections_Agrocheck.write_excel("data/en_corrections_processed_agrovoc.xlsx")

Total terms changed by Agrovoc case fix: 51


<xlsxwriter.workbook.Workbook at 0x29184a627b0>

# Pluralization

In [4]:
agrovoc_path = "data/agrovoc_enhanced.xlsx"