# Translation with NLLB-200 using CTranslate2

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Loading the data

In [6]:
import os

data_path = "data"
src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

lang_dict = { "spa_Latn": "Spanish", "eng_Latn": "English" } # TO EXTEND

#directory = os.path.join(data_path, "spanish")

#os.chdir(directory)
os.getcwd()

'c:\\Users\\Nelso\\Documents\\Glossary-NMT-LLM'

In [39]:
# Load Glossary
import os
glossary_file = "glossary_UNEP_202505.xlsx"
glossary_file = "UNBIS.xlsx"
glossary_path = os.path.join("data", glossary_file)

import polars as pl
#glossary_df = pl.read_excel(glossary_path, sheet_name="CombinedGlossaries")
glossary_df = pl.read_excel(glossary_path)

glossary_remove_list = ["n/a", "N/A", "NaN", "nan", "None", "none", "null", "Null", "(Not provided)", "Not provided", "not provided", "N/A (not applicable)", "-"]
count_rows = glossary_df.shape[0]
print(f"Initial number of rows in the glossary: {count_rows}")
# Set NaN to cells (any column) in the glossary df that are in the remove list
count_remove = glossary_df.select(pl.all().is_in(glossary_remove_list)).sum().sum()
glossary_df = glossary_df.with_columns([
    pl.when(pl.col(col).is_in(glossary_remove_list))
    .then(None)
    .otherwise(pl.col(col))
    .alias(col)
    for col in glossary_df.columns
])
new_count_remove = glossary_df.select(pl.all().is_null()).sum().sum()
print(f"Removed {count_remove} cells from the glossary. New count of NaN cells: {new_count_remove}")

count_rows_after = glossary_df.shape[0]
print(f"Number of rows in the glossary after removing NaN cells: {count_rows_after}")

Initial number of rows in the glossary: 7279
Removed shape: (1, 13)
┌────────┬─────────┬────────┬────────┬───┬───────┬───────┬───────┬───────┐
│ itemID ┆ English ┆ Arabic ┆ French ┆ … ┆ FRalt ┆ SPalt ┆ CHalt ┆ RUalt │
│ ---    ┆ ---     ┆ ---    ┆ ---    ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ u32    ┆ u32     ┆ u32    ┆ u32    ┆   ┆ u32   ┆ u32   ┆ u32   ┆ u32   │
╞════════╪═════════╪════════╪════════╪═══╪═══════╪═══════╪═══════╪═══════╡
│ 0      ┆ 0       ┆ 0      ┆ 0      ┆ … ┆ 0     ┆ 0     ┆ 0     ┆ 0     │
└────────┴─────────┴────────┴────────┴───┴───────┴───────┴───────┴───────┘ cells from the glossary. New count of NaN cells: shape: (1, 13)
┌────────┬─────────┬────────┬────────┬───┬───────┬───────┬───────┬───────┐
│ itemID ┆ English ┆ Arabic ┆ French ┆ … ┆ FRalt ┆ SPalt ┆ CHalt ┆ RUalt │
│ ---    ┆ ---     ┆ ---    ┆ ---    ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ u32    ┆ u32     ┆ u32    ┆ u32    ┆   ┆ u32   ┆ u32   ┆ u32   ┆ u32   │
╞════════╪═════════╪════════╪════════╪═══╪═

## Transform uppercase into lowercase and correct if wrong spelling (missing diactricis) for Spanish, Russian, French

#!pip install pyspellchecker

In [12]:
from spellchecker import SpellChecker

def word_spell_check(word, lang):
    """
    Check spelling of a word in specified language and return correction if needed.
    
    Args:
        word (str): Word to check
        lang (str): Language column name from glossary_df 
                   (English, Arabic, French, Spanish, Chinese, Russian, Portuguese, Swahili)
    
    Returns:
        dict: {
            'original': original word,
            'is_correct': boolean,
            'correction': suggested correction or None,
            'candidates': list of alternative suggestions
        }
    """
    
    # Language mapping from glossary column names to pyspellchecker language codes
    lang_mapping = {
        'English': 'en',
        'Arabic': 'ar', 
        'French': 'fr',
        'Spanish': 'es',
        'Chinese': None,  # Not supported by pyspellchecker
        'Russian': 'ru',
        'Portuguese': 'pt',
        'Swahili': None   # Not supported by pyspellchecker
    }
    
    # Handle empty or None input - corrected condition
    if not word or word is None or (isinstance(word, str) and word.strip() == ""):
        return {
            'original': word,
            'is_correct': None,
            'correction': None,
            'candidates': []
        }
    
    # Get language code
    lang_code = lang_mapping.get(lang)
    
    # Handle unsupported languages
    if lang_code is None:
        return {
            'original': word,
            'is_correct': None,
            'correction': f"Language '{lang}' not supported by spell checker",
            'candidates': []
        }
    
    try:
        # Initialize spell checker for the specific language
        spell = SpellChecker(language=lang_code)
        
        # Clean the word (remove extra spaces, convert to lowercase for checking)
        clean_word = word.strip().lower()
        
        # Check if word is correct
        is_correct = clean_word in spell
        
        # If incorrect, get suggestions
        if not is_correct:
            correction = spell.correction(clean_word)
            candidates = list(spell.candidates(clean_word))
        else:
            correction = None
            candidates = []
        
        return {
            'original': word,
            'is_correct': is_correct,
            'correction': correction,
            'candidates': candidates[:5]  # Limit to top 5 suggestions
        }
        
    except Exception as e:
        return {
            'original': word,
            'is_correct': None,
            'correction': f"Error: {str(e)}",
            'candidates': []
        }

def check_glossary_spelling(glossary_df, lang_column):
    """
    Apply spell checking to all words in a specific language column of the glossary.
    
    Args:
        glossary_df: Your glossary DataFrame
        lang_column: Column name to check (e.g., 'English', 'Spanish', etc.)
    
    Returns:
        DataFrame with spell check results
    """
    
    # Get non-null values from the specified column - corrected approach
    words_to_check = glossary_df.select(lang_column).filter(
        pl.col(lang_column).is_not_null() &
        pl.col(lang_column).str.strip_chars().str.len_chars() > 0 &
        pl.col(lang_column).str.contains(r'^[a-zA-ZÀ-ÿ\s]+$')  # Allow letters and accented chars
    )
    
    results = []
    # Extract the actual string values from the DataFrame
    for row in words_to_check.iter_rows():
        word = row[0]  # Get the first (and only) column value
        if word:  # Double-check it's not None
            result = word_spell_check(word.lower(), lang_column)
            result['column'] = lang_column
            results.append(result)
    
    return pl.DataFrame(results)

# Test individual words:
print("Testing spell checker:")
print(word_spell_check("hello", "English"))
print(word_spell_check("hapenning", "English"))  
print(word_spell_check("africa", "Spanish"))
print(word_spell_check("manana", "Spanish"))  # misspelled

Testing spell checker:
{'original': 'hello', 'is_correct': True, 'correction': None, 'candidates': []}
{'original': 'hapenning', 'is_correct': False, 'correction': 'happening', 'candidates': ['japanning', 'happening', 'apennine', 'penning']}
{'original': 'africa', 'is_correct': False, 'correction': 'áfrica', 'candidates': ['áfrica']}
{'original': 'manana', 'is_correct': False, 'correction': 'mañana', 'candidates': ['canana', 'mañana', 'manada', 'mangana', 'macana']}


In [12]:
from transformers import pipeline
import re
# Initialize the text correction pipeline
generator = pipeline("text2text-generation", model="sdadas/byt5-text-correction")

from tqdm.notebook import tqdm
import time

def sent_check(sentence, lang):
    """
    Check and correct a sentence using ByT5 text correction model.
    
    Args:
        sentence (str): Sentence to check and correct
        lang (str): Language column name from glossary_df 
                   (English, Arabic, French, Spanish, Chinese, Russian, Portuguese, Swahili)
    
    Returns:
        dict: {
            'original': original sentence,
            'corrected': corrected sentence,
            'changed': boolean indicating if correction was applied,
            'language_code': language code used for correction
        }
    """
    
    # Language mapping from glossary column names to ByT5 language codes
    lang_mapping = {
        'English': 'en',
        'Arabic': 'ar', 
        'French': 'fr',
        'Spanish': 'es',
        'Chinese': 'zh',
        'Russian': 'ru',
        'Portuguese': 'pt',
        'Swahili': 'sw'  # ByT5 supports more languages than pyspellchecker
    }

    lang_intro = {
    'English': 'The term is: ',
    'Arabic': 'المصطلح هو: ',         # "Al-mustalaḥ huwa" — correct for a conceptual term
    'French': 'Le terme est : ',       # "Terme" is used for conceptual terms
    'Spanish': 'El término es: ',      # "Término" is correct for a glossary/terminology entry
    'Chinese': '术语是：',              # "术语" (shùyǔ) means technical or conceptual term
    'Russian': 'Термин: ',             # "Термин" is used for conceptual/technical terms
    'Portuguese': 'O termo é: ',       # "Termo" is correct for a conceptual term
    'Swahili': 'Neno la istilahi ni: ' # More precise than just "Neno ni", which means "The word is"
}

    
    # Handle empty or None input
    if not sentence or sentence is None or (isinstance(sentence, str) and sentence.strip() == ""):
        return {
            'original': sentence,
            'corrected': sentence,
            'changed': False,
            'language_code': None
        }
    
    # Get language code
    lang_code = lang_mapping.get(lang)
    lang_intro_text = lang_intro.get(lang)
    
    # Handle unsupported languages (though ByT5 supports many)
    if lang_code is None:
        return {
            'original': sentence,
            'corrected': sentence,
            'changed': False,
            'language_code': f"Language '{lang}' not supported"
        }
    
    try:
        # Clean the sentence (remove extra spaces)
        clean_sentence = sentence.strip()
        
        # Format sentence with language code for ByT5
        formatted_input = f"<{lang_code}> {lang_intro_text}{clean_sentence.lower()}"
        
        # Apply text correction
        corrected_result = generator(formatted_input, max_new_tokens=512)
        corrected_sentence = corrected_result[0]['generated_text']
        if corrected_sentence[-1] == ".":
            corrected_sentence = corrected_sentence[:-1]
        # Remove intro text from corrected sent
        corrected_sentence = corrected_sentence.replace(lang_intro_text, "")
        corrected_sentence = corrected_sentence.strip()
        
        def correct_with_word_boundaries(text, geoname, correction):
            """Replace geoname with correction only at word boundaries"""
            # Create pattern that matches the word at proper boundaries
            pattern = r'\b{}\b'.format(re.escape(geoname))
            # For hyphenated words, also match at hyphens
            hyphen_pattern = r'(\-){}'.format(re.escape(geoname))
            # Replace at word boundaries
            result = re.sub(pattern, correction, text, flags=re.IGNORECASE)
            # Replace when preceded by hyphen
            result = re.sub(hyphen_pattern, r'\1{}'.format(correction), result, flags=re.IGNORECASE)
            return result
        
        # Apply word boundary correction for geonames
        # Apply correction with proper word boundaries
        for geoname, proper_form in geoname_case_mapping.items():
            corrected_sentence = correct_with_word_boundaries(corrected_sentence, geoname, proper_form)


        # Check if correction was applied
        changed = corrected_sentence.lower() != clean_sentence.lower()
        
        return {
            'original': sentence,
            'corrected': corrected_sentence,
            'changed': changed,
            'language_code': lang_code
        }
        
    except Exception as e:
        return {
            'original': sentence,
            'corrected': sentence,
            'changed': False,
            'language_code': f"Error: {str(e)}"
        }

def check_glossary_sents(glossary_df, lang_column):
    """
    Apply sentence-level text correction to all entries in a specific language column of the glossary.
    
    Args:
        glossary_df: Your glossary DataFrame
        lang_column: Column name to check (e.g., 'English', 'Spanish', etc.)
    
    Returns:
        DataFrame with sentence correction results
    """
    
    # Get non-null values from the specified column
    sentences_to_check = glossary_df.select(lang_column).filter(
    (pl.col(lang_column).is_not_null()) &
    (pl.col(lang_column).str.strip_chars().str.len_chars() > 2) &  # Minimum 3 characters
    (pl.col(lang_column).str.to_uppercase() == pl.col(lang_column))
)
    
    results = []
    # Extract the actual string values from the DataFrame
    for row in sentences_to_check.iter_rows():
        sentence = row[0]  # Get the first (and only) column value
        if sentence:  # Double-check it's not None
            result = sent_check(sentence, lang_column)
            result['column'] = lang_column
            results.append(result)
            print(f"Processed: {sentence}")
    
    return pl.DataFrame(results)

# Alternative batch processing function for better performance
def check_glossary_sents_batch(glossary_df, lang_column, batch_size=10, intro_prompt=False):
    """
    Apply sentence-level text correction in batches for better performance.
    
    Args:
        glossary_df: Your glossary DataFrame
        lang_column: Column name to check
        batch_size: Number of sentences to process at once
        intro_prompt: Whether to use intro prompts
    
    Returns:
        DataFrame with sentence correction results
    """
    
    # Language mapping
    lang_mapping = {
        'English': 'en', 'Arabic': 'ar', 'French': 'fr', 'Spanish': 'es',
        'Chinese': 'zh', 'Russian': 'ru', 'Portuguese': 'pt', 'Swahili': 'sw'
    }
    
    lang_intro = {
        'English': 'The term is ',
        'Arabic': 'المصطلح هو ',         
        'French': 'Le terme est  ',       
        'Spanish': 'El término es ',      
        'Chinese': '术语是：',              
        'Russian': 'Термин ',             
        'Portuguese': 'O termo é ',       
        'Swahili': 'Neno la istilahi ni ' 
    }

    # Get language code
    lang_code = lang_mapping.get(lang_column)
    lang_intro_text = lang_intro.get(lang_column)
    if not lang_code:
        print(f"Language '{lang_column}' not supported")
        return pl.DataFrame()
    
    # Get sentences to process
    sentences_to_check = glossary_df.select(lang_column).filter(
        (pl.col(lang_column).is_not_null()) &
        (pl.col(lang_column).str.strip_chars().str.len_chars() > 2) &  # Minimum 3 characters
        (pl.col(lang_column).str.to_uppercase() == pl.col(lang_column))
    )
    
    sentences = [row[0] for row in sentences_to_check.iter_rows() if row[0]]
    
    if not sentences:
        return pl.DataFrame()
    
    results = []
    
    # Calculate total number of batches for progress bar
    total_batches = (len(sentences) + batch_size - 1) // batch_size
    
    # Process in batches with notebook-optimized progress bar
    with tqdm(total=total_batches, 
              desc=f"Processing {lang_column}", 
              unit="batch",
              leave=True,
              ncols=400) as pbar:
        
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i + batch_size]
            
            # Format batch with language codes
            if intro_prompt:
                formatted_batch = [f"<{lang_code}> {lang_intro_text}{sent.strip().lower()}" for sent in batch]
            else:
                formatted_batch = [f"<{lang_code}> {sent.strip().lower()}." for sent in batch]
            
            try:
                # Apply correction to batch
                corrected_batch = generator(formatted_batch, max_new_tokens=512)
                
                # Process results
                for j, (original, corrected_result) in enumerate(zip(batch, corrected_batch)):
                    corrected = corrected_result['generated_text']
                    changed = corrected.lower() != original.strip().lower()

                    # Clean correction
                    if corrected[-1] == ".":
                        corrected = corrected[:-1]
                    # Remove intro text from corrected sent
                    corrected = corrected.replace(lang_intro_text, "")
                    corrected = corrected.strip()
                    
                    results.append({
                        'original': original,
                        'corrected': corrected,
                        'changed': changed,
                        'language_code': lang_code,
                        'column': lang_column
                    })
                    
            except Exception as e:
                # If batch fails, process individually
                for sent in batch:
                    result = sent_check(sent, lang_column)
                    result['column'] = lang_column
                    results.append(result)
            
            # Update progress bar with additional info
            pbar.update(1)
            pbar.set_postfix({
                'sentences': len(results),
                'batch_size': len(batch),
                'changes': sum(1 for r in results if r.get('changed', False))
            })
    
    return pl.DataFrame(results)


Device set to use cuda:0


In [13]:
# dataset version
def check_glossary_sents_batch(glossary_df, lang_column, batch_size=10, intro_prompt=False):
    """
    Apply sentence-level text correction in batches for better performance.
    """
    # Language mapping
    lang_mapping = {
        'English': 'en', 'Arabic': 'ar', 'French': 'fr', 'Spanish': 'es',
        'Chinese': 'zh', 'Russian': 'ru', 'Portuguese': 'pt', 'Swahili': 'sw'
    }
    
    lang_intro = {
        'English': 'I am quite interested in ',
        'Arabic': 'المصطلح هو ',         
        'French': 'Le terme est  ',       
        'Spanish': 'El término es ',      
        'Chinese': '术语是：',              
        'Russian': 'Термин ',             
        'Portuguese': 'O termo é ',       
        'Swahili': 'Neno la istilahi ni ' 
    }

    # Get language code
    lang_code = lang_mapping.get(lang_column)
    lang_intro_text = lang_intro.get(lang_column)
    if not lang_code:
        print(f"Language '{lang_column}' not supported")
        return pl.DataFrame()
    
    # Get sentences to process
    sentences_to_check = glossary_df.select(lang_column).filter(
        (pl.col(lang_column).is_not_null()) &
        (pl.col(lang_column).str.strip_chars().str.len_chars() > 2) &
        (pl.col(lang_column).str.to_uppercase() == pl.col(lang_column))
    )
    
    sentences = [row[0] for row in sentences_to_check.iter_rows() if row[0]]
    
    if not sentences:
        return pl.DataFrame()
    
    results = []
    
    # Calculate total number of batches for progress bar
    total_batches = (len(sentences) + batch_size - 1) // batch_size
    
    # Process in batches with notebook-optimized progress bar
    with tqdm(total=total_batches, 
              desc=f"Processing {lang_column}", 
              unit="batch",
              leave=True,
              ncols=400) as pbar:
        
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i + batch_size]
            
            # Format batch with language codes
            if intro_prompt:
                formatted_batch = [f"<{lang_code}> {lang_intro_text}{sent.strip().lower()}" for sent in batch]
            else:
                formatted_batch = [f"<{lang_code}> {sent.strip().lower()}." for sent in batch]
            
            try:
                # Apply correction to entire batch at once - this is the key change
                corrected_batch = generator(formatted_batch, 
                                          max_new_tokens=512,
                                          batch_size=len(formatted_batch),  # Process entire batch together
                                          clean_up_tokenization_spaces=True)
                
                # Process results
                for j, (original, corrected_result) in enumerate(zip(batch, corrected_batch)):
                    corrected = corrected_result['generated_text']
                    changed = corrected.lower() != original.strip().lower()

                    # Clean correction
                    if corrected.endswith("."):
                        corrected = corrected[:-1]
                    # Remove intro text from corrected sent
                    corrected = corrected.replace(lang_intro_text, "")
                    corrected = corrected.strip()
                    
                    results.append({
                        'original': original,
                        'corrected': corrected,
                        'changed': changed,
                        'language_code': lang_code,
                        'column': lang_column
                    })
                    
            except Exception as e:
                # If batch fails, process individually as fallback
                print(f"Batch processing failed, falling back to individual processing: {e}")
                for sent in batch:
                    result = sent_check(sent, lang_column)
                    result['column'] = lang_column
                    results.append(result)
            
            # Update progress bar with additional info
            pbar.update(1)
            pbar.set_postfix({
                'sentences': len(results),
                'batch_size': len(batch),
                'changes': sum(1 for r in results if r.get('changed', False))
            })
    
    return pl.DataFrame(results)

In [19]:
import polars as pl

glossary_df = pl.read_excel("data/unbist-20250708.xlsx")

# Test the functions
print("Testing sentence checker:")
test_sentences = [
    ("hello world this is a test", "English"),
    ("hola mundo esto es una prueba", "Spanish"),
    ("BONJOUR MONDE CECI EST UN TEST", "French"),
    #("привет мир это тест", "Russian")
]

for sentence, lang in test_sentences:
    result = sent_check(sentence, lang)
    print(f"Original ({lang}): {result['original']}")
    print(f"Corrected: {result['corrected']}")
    print(f"Changed: {result['changed']}")
    print()

# Batch process Spanish column
#es_corrections = check_glossary_sents_batch(glossary_df, "Spanish", batch_size=20)
# Export to Excel
#es_corrections.write_excel("es_corrections.xlsx")

# Batch process french column
#fr_corrections = check_glossary_sents_batch(glossary_df, "French", batch_size=20)
# Export to Excel
#fr_corrections.write_excel("fr_corrections.xlsx")

en_corrections2 = check_glossary_sents_batch(glossary_df, "English", batch_size=20, intro_prompt=True)
#es_corrections

Testing sentence checker:
Original (English): hello world this is a test
Corrected: hello world this is a test
Changed: False

Original (Spanish): hola mundo esto es una prueba
Corrected: hola mundo esto es una prueba
Changed: False

Original (French): BONJOUR MONDE CECI EST UN TEST
Corrected: bonjour monde ceci est un test
Changed: False



Processing English:   0%|                                                                                     …

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [23]:
en_corrections2.write_excel("data/en_corrections2.xlsx")

<xlsxwriter.workbook.Workbook at 0x213f9ef4d10>

## Replace countries and international regions by capital letter


Total geographical names for exclusion: 586
Sample geonames: ['Saint Kitts and Nevis', 'Eastern Europe', 'Hayastani', 'Cuban', 'Portugal', 'Saint Pierre and Miquelon', 'Austria', 'Ukraine', 'Åland Islands', 'Macau']
Processing English corrections...
en_corrections2 has 7279 rows
Starting geoname case correction on 7279 terms...
Attempting vectorized processing...
Vectorized processing completed in 0.09 seconds
English corrections saved to data/en_corrections_processed2.xlsx with 7279 rows
Total corrections with changes: 3

First few rows of processed corrections:
shape: (5, 5)
┌───────────────────────────────┬──────────────────────────────┬─────────┬───────────────┬─────────┐
│ original                      ┆ corrected                    ┆ changed ┆ language_code ┆ column  │
│ ---                           ┆ ---                          ┆ ---     ┆ ---           ┆ ---     │
│ str                           ┆ str                          ┆ bool    ┆ str           ┆ str     │
╞═══════════

In [None]:
import requests
import json
from urllib.parse import quote
from rapidfuzz import fuzz
import time

def search_wikidata_fuzzy(query, language='en', min_score=80, max_results=5):
    """
    Search Wikidata with fuzzy matching for entities
    
    Args:
        query: Search term (case insensitive)
        language: Language code (default: 'en')
        min_score: Minimum fuzzy match score (default: 80)
        max_results: Maximum number of results to return
    
    Returns:
        List of matching entities with fuzzy scores
    """
    try:
        # Clean the query
        clean_query = query.strip()
        
        # Wikidata search API
        url = "https://www.wikidata.org/w/api.php"
        params = {
            'action': 'wbsearchentities',
            'format': 'json',
            'uselang': language,
            'search': clean_query,
            'language': language,
            'limit': 10,  # Get more results for fuzzy filtering
            'formatversion': 2
        }
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, params=params, headers=headers, timeout=10)
        
        if response.status_code == 200:
            data = response.json()
            
            if 'search' not in data:
                return []
            
            results = []
            query_lower = clean_query.lower()
            
            for item in data['search']:
                label = item.get('label', '')
                description = item.get('description', '')
                
                # Calculate fuzzy match score (case insensitive)
                score = fuzz.ratio(query_lower, label.lower())
                
                # Also check if it's an exact case-insensitive match
                exact_match = query_lower == label.lower()
                
                if score >= min_score or exact_match:
                    results.append({
                        'id': item.get('id', ''),
                        'label': label,
                        'description': description,
                        'score': score,
                        'exact_match': exact_match,
                        'url': f"https://www.wikidata.org/entity/{item.get('id', '')}"
                    })
            
            # Sort by score (descending) and exact matches first
            results.sort(key=lambda x: (x['exact_match'], x['score']), reverse=True)
            
            return results[:max_results]
            
        else:
            print(f"Wikidata API returned status code: {response.status_code}")
            return []
            
    except Exception as e:
        print(f"Wikidata search error for '{query}': {str(e)}")
        return []

def get_wikidata_multilingual_labels(entity_id, languages=['en', 'es', 'fr', 'de', 'it']):
    """
    Get labels in multiple languages for a Wikidata entity
    
    Args:
        entity_id: Wikidata entity ID (e.g., 'Q545')
        languages: List of language codes
    
    Returns:
        Dictionary with language codes as keys and labels as values
    """
    try:
        url = "https://www.wikidata.org/w/api.php"
        params = {
            'action': 'wbgetentities',
            'format': 'json',
            'ids': entity_id,
            'props': 'labels',
            'languages': '|'.join(languages),
            'formatversion': 2
        }
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, params=params, headers=headers, timeout=10)
        
        if response.status_code == 200:
            data = response.json()
            
            if 'entities' in data and entity_id in data['entities']:
                entity = data['entities'][entity_id]
                labels = entity.get('labels', {})
                
                result = {}
                for lang in languages:
                    if lang in labels:
                        result[lang] = labels[lang]['value']
                    else:
                        result[lang] = ''
                
                return result
            
        return {}
        
    except Exception as e:
        print(f"Error getting multilingual labels for {entity_id}: {str(e)}")
        return {}

def enhanced_wikidata_search(query, target_languages=['es', 'fr'], min_score=80):
    """
    Enhanced Wikidata search with multilingual results
    
    Args:
        query: Search term
        target_languages: Languages to get translations for
        min_score: Minimum fuzzy match score
    
    Returns:
        Dictionary with search results and translations
    """
    # Search for entities
    entities = search_wikidata_fuzzy(query, min_score=min_score)
    
    if not entities:
        return {
            'query': query,
            'found': False,
            'entities': []
        }
    
    # Get multilingual labels for the best matches
    enhanced_entities = []
    
    for entity in entities[:3]:  # Process top 3 matches
        entity_id = entity['id']
        
        # Get labels in target languages
        all_languages = ['en'] + target_languages
        multilingual_labels = get_wikidata_multilingual_labels(entity_id, all_languages)
        
        enhanced_entity = {
            **entity,
            'multilingual_labels': multilingual_labels
        }
        enhanced_entities.append(enhanced_entity)
        
        # Small delay to be respectful to the API
        time.sleep(0.1)
    
    return {
        'query': query,
        'found': True,
        'entities': enhanced_entities
    }

# Test the enhanced search
test_terms = [
    "Baltic Sea",
    "baltic sea", 
    "BALTIC SEA",
    "China",
    "United Nations",
    "climate change",
    "european Union",
    "european",
    "Albania"
]

print("Testing Enhanced Wikidata Search:")
print("=" * 50)

for term in test_terms:
    print(f"\nSearching for: '{term}'")
    result = enhanced_wikidata_search(term, target_languages=['es', 'fr'], min_score=70)
    
    if result['found']:
        print(f"Found {len(result['entities'])} entities:")
        for i, entity in enumerate(result['entities'], 1):
            print(f"  {i}. {entity['label']} (Score: {entity['score']}, Exact: {entity['exact_match']})")
            print(f"     Description: {entity['description']}")
            print(f"     Translations: {entity['multilingual_labels']}")
            print(f"     URL: {entity['url']}")
    else:
        print("No matches found")
    print("-" * 30)

Testing Enhanced Wikidata Search:

Searching for: 'Baltic Sea'
Found 3 entities:
  1. Baltic Sea (Score: 100.0, Exact: True)
     Description: sea in Northern Europe
     Translations: {'en': 'Baltic Sea', 'es': 'Mar Báltico', 'fr': 'mer Baltique'}
     URL: https://www.wikidata.org/entity/Q545
  2. Baltic Sea (Score: 100.0, Exact: True)
     Description: ship built in 2005
     Translations: {'en': 'Baltic Sea', 'es': '', 'fr': ''}
     URL: https://www.wikidata.org/entity/Q83600854
  3. Baltic Sea (Score: 100.0, Exact: True)
     Description: ship built in 1964
     Translations: {'en': 'Baltic Sea', 'es': '', 'fr': ''}
     URL: https://www.wikidata.org/entity/Q83647454
------------------------------

Searching for: 'baltic sea'
Found 3 entities:
  1. Baltic Sea (Score: 100.0, Exact: True)
     Description: sea in Northern Europe
     Translations: {'en': 'Baltic Sea', 'es': 'Mar Báltico', 'fr': 'mer Baltique'}
     URL: https://www.wikidata.org/entity/Q545
  2. Baltic Sea (Score: 1

In [None]:
word

[{'etymology': '',
  'definitions': [],
  'pronunciations': {'text': [], 'audio': []}}]

In [None]:
# set all to lowercase excepting Geonames

#load geonames
geonames_file_path = "data/M49_countries.xlsx"
geonames_df = pl.read_excel(geonames_file_path)
# Load the master glossary
master_glossary_df = pl.read_excel("data/glossary_UNEP_202505.xlsx")

# Get the list of Spanish geonames for exclusion
spanish_geonames = geonames_df.select("Spanish").filter(
    pl.col("Spanish").is_not_null()
).to_series().to_list()

french_geonames = geonames_df.select("French").filter(
    pl.col("French").is_not_null()
).to_series().to_list()

russian_geonames = geonames_df.select("Russian").filter(
    pl.col("Russian").is_not_null()
).to_series().to_list()

# For each value in 'corrected' column: if the first character is uppercase but the second is not, convert to lowercase
# BUT exclude geonames (don't convert them to lowercase)

# Check if es_corrections exists before creating es_corrections_low
if 'es_corrections' in locals():
    es_corrections_low = es_corrections.clone().with_columns(
        pl.when(
            (pl.col("corrected").str.slice(0, 1) == pl.col("corrected").str.slice(0, 1).str.to_uppercase()) &
            (pl.col("corrected").str.slice(1, 1) == pl.col("corrected").str.slice(1, 1).str.to_lowercase()) &
            (pl.col("corrected").str.len_chars() >= 2) &
            (~pl.col("corrected").is_in(spanish_geonames))  # Exclude geonames
        )
        .then(pl.col("corrected").str.to_lowercase())
        .otherwise(pl.col("corrected"))
        .alias("corrected")
    )
else:
    print("es_corrections not found. Skipping Spanish corrections.")

es_corrections_low = es_corrections_low.with_columns([
        pl.col("corrected").str.replace_all("Africa ", "África ").str.replace_all("^Africa$", "África").alias("corrected")
    ])

# Process French corrections (this should work since fr_corrections exists)
fr_corrections_low = fr_corrections.clone().with_columns(
    pl.when(
        (pl.col("corrected").str.slice(0, 1) == pl.col("corrected").str.slice(0, 1).str.to_uppercase()) &
        (pl.col("corrected").str.slice(1, 1) == pl.col("corrected").str.slice(1, 1).str.to_lowercase()) &
        (pl.col("corrected").str.len_chars() >= 2) &
        (~pl.col("corrected").is_in(french_geonames))  # Exclude geonames
    )
    .then(pl.col("corrected").str.to_lowercase())
    .otherwise(pl.col("corrected"))
    .alias("corrected")
)
#master_glossary_df = glossary_df

master_glossary_df = master_glossary_df.with_columns(
    pl.when(
        (pl.col("Russian").str.slice(0, 1) == pl.col("Russian").str.slice(0, 1).str.to_uppercase()) &
        (pl.col("Russian").str.slice(1, 1) == pl.col("Russian").str.slice(1, 1).str.to_lowercase()) &
        (pl.col("Russian").str.len_chars() >= 2) &
        (pl.col("Russian").is_not_null()) &  # Ensure not null
        (~pl.col("Russian").is_in(russian_geonames))  # Exclude geonames
    )
    .then(pl.col("Russian").str.to_lowercase())
    .otherwise(pl.col("Russian"))
    .alias("Russian")
)

print("Russian column lowercase transformation applied (excluding geonames).")




# Replace the original language columns with corrected ones by matching original values

# For Spanish corrections (if es_corrections_low exists)
if 'es_corrections_low' in locals():
    print("Processing Spanish corrections...")
    print(f"es_corrections_low has {es_corrections_low.shape[0]} rows")
    
    # Create a mapping from original to corrected values
    spanish_corrections_map = dict(zip(
        es_corrections_low.get_column("original").to_list(),
        es_corrections_low.get_column("corrected").to_list()
    ))
    
    # Apply corrections using map_elements
    master_glossary_df = master_glossary_df.with_columns([
        pl.col("Spanish").map_elements(
            lambda x: spanish_corrections_map.get(x, x) if x is not None else x,
            return_dtype=pl.Utf8
        ).alias("Spanish")
    ])
    print("Spanish corrections applied.")
else:
    print("Skipping Spanish corrections - es_corrections_low not available.")

# For French corrections
if 'fr_corrections_low' in locals():
    print("Processing French corrections...")
    print(f"fr_corrections_low has {fr_corrections_low.shape[0]} rows")
    
    # Create a mapping from original to corrected values
    french_corrections_map = dict(zip(
        fr_corrections_low.get_column("original").to_list(),
        fr_corrections_low.get_column("corrected").to_list()
    ))
    
    # Apply corrections using map_elements
    master_glossary_df = master_glossary_df.with_columns([
        pl.col("French").map_elements(
            lambda x: french_corrections_map.get(x, x) if x is not None else x,
            return_dtype=pl.Utf8
        ).alias("French")
    ])
    print("French corrections applied.")

# Save the updated glossary
master_glossary_df.write_excel("data/glossaryUNEP_corrected.xlsx")


print(f"Updated glossary saved with {master_glossary_df.shape[0]} rows")
print("Columns:", master_glossary_df.columns)


Russian column lowercase transformation applied (excluding geonames).
Processing Spanish corrections...
es_corrections_low has 7276 rows
Spanish corrections applied.
Processing French corrections...
fr_corrections_low has 7276 rows
French corrections applied.
Updated glossary saved with 5980 rows
Columns: ['Keyword', 'Category', 'English', 'Arabic', 'French', 'Spanish', 'Chinese', 'Russian', 'Portuguese', 'Swahili']


In [None]:
master_glossary_es = pl.read_excel("data/Keywords language versions in separate sheets.xlsx", sheet_name="ES")

# For Spanish corrections (if es_corrections_low exists)
if 'es_corrections_low' in locals():
    print("Processing Spanish corrections...")
    print(f"es_corrections_low has {es_corrections_low.shape[0]} rows")
    
    # Create a mapping from original to corrected values
    spanish_corrections_map = dict(zip(
        es_corrections_low.get_column("original").to_list(),
        es_corrections_low.get_column("corrected").to_list()
    ))
    
    # Apply corrections using map_elements
    master_glossary_es = master_glossary_es.with_columns([
        pl.col("UNBIS").map_elements(
            lambda x: spanish_corrections_map.get(x, x) if x is not None else x,
            return_dtype=pl.Utf8
        ).alias("UNBIS")
    ])
    print("Spanish corrections applied.")
else:
    print("Skipping Spanish corrections - es_corrections_low not available.")

master_glossary_fr = pl.read_excel("data/Keywords language versions in separate sheets.xlsx", sheet_name="FR")
# For French corrections
if 'fr_corrections_low' in locals():
    print("Processing French corrections...")
    print(f"fr_corrections_low has {fr_corrections_low.shape[0]} rows")
    
    # Create a mapping from original to corrected values
    french_corrections_map = dict(zip(
        fr_corrections_low.get_column("original").to_list(),
        fr_corrections_low.get_column("corrected").to_list()
    ))
    
    # Apply corrections using map_elements
    master_glossary_fr = master_glossary_fr.with_columns([
        pl.col("UNBIS").map_elements(
            lambda x: french_corrections_map.get(x, x) if x is not None else x,
            return_dtype=pl.Utf8
        ).alias("UNBIS")
    ])
    print("French corrections applied.")



master_glossary_ru = pl.read_excel("data/Keywords language versions in separate sheets.xlsx", sheet_name="RU")
master_glossary_ru = master_glossary_ru.with_columns(
    pl.when(
        (pl.col("UNBIS").str.slice(0, 1) == pl.col("UNBIS").str.slice(0, 1).str.to_uppercase()) &
        (pl.col("UNBIS").str.slice(1, 1) == pl.col("UNBIS").str.slice(1, 1).str.to_lowercase()) &
        (pl.col("UNBIS").str.len_chars() >= 2) &
        (pl.col("UNBIS").is_not_null()) &  # Ensure not null
        (~pl.col("UNBIS").is_in(russian_geonames))  # Exclude geonames
    )
    .then(pl.col("UNBIS").str.to_lowercase())
    .otherwise(pl.col("UNBIS"))
    .alias("UNBIS")
)
master_glossary_es.write_excel("data/Keywords language versions in separate sheets_corrected_ES.xlsx", worksheet="ES")
master_glossary_fr.write_excel("data/Keywords language versions in separate sheets_corrected_FR.xlsx", worksheet="FR")
master_glossary_ru.write_excel("data/Keywords language versions in separate sheets_corrected_RU.xlsx", worksheet="RU")

Could not determine dtype for column 21, falling back to string
Could not determine dtype for column 24, falling back to string


Processing Spanish corrections...
es_corrections_low has 7276 rows
Spanish corrections applied.


Could not determine dtype for column 21, falling back to string
Could not determine dtype for column 24, falling back to string


Processing French corrections...
fr_corrections_low has 7276 rows
French corrections applied.


Could not determine dtype for column 21, falling back to string
Could not determine dtype for column 24, falling back to string


<xlsxwriter.workbook.Workbook at 0x127108026c0>

In [None]:
# Debug: Check what columns exist in fr_corrections_low
print("Columns in fr_corrections_low:", fr_corrections_low.columns)
print("\nFirst few rows:")
print(fr_corrections_low.head())

Columns in fr_corrections_low: ['original', 'corrected', 'changed', 'language_code', 'column']

First few rows:
shape: (5, 5)
┌────────────────────┬────────────────────┬─────────┬───────────────┬────────┐
│ original           ┆ corrected          ┆ changed ┆ language_code ┆ column │
│ ---                ┆ ---                ┆ ---     ┆ ---           ┆ ---    │
│ str                ┆ str                ┆ bool    ┆ str           ┆ str    │
╞════════════════════╪════════════════════╪═════════╪═══════════════╪════════╡
│ ABACA              ┆ abaca              ┆ true    ┆ fr            ┆ French │
│ ENFANTS ABANDONNES ┆ enfants abandonnés ┆ true    ┆ fr            ┆ French │
│ ABREVIATIONS       ┆ abréviations       ┆ true    ┆ fr            ┆ French │
│ ENLEVEMENT         ┆ enlèvement         ┆ true    ┆ fr            ┆ French │
│ AVORTEMENT         ┆ avortement         ┆ true    ┆ fr            ┆ French │
└────────────────────┴────────────────────┴─────────┴───────────────┴────────┘


In [None]:
# Load translation memory

source_test_file = "all-filtered.es.real.test"
target_test_file = "all-filtered.en.real.test"

with open(source_test_file, encoding="utf-8") as source, open(target_test_file, encoding="utf-8") as target:
  source_sentences = [sent.strip() for sent in source.readlines()]
  target_sentences = [sent.strip() for sent in target.readlines()]

print(source_sentences[0])
print(target_sentences[0])

Período de validez después de abierto el envase: 10 horas.
Shelf life after first opening the container: 10 hours.


In [None]:
# Load fuzzy matches from the Context Dataset

online_test_file = "all-filtered.esen.ms-multi-12.online.test"

src_lang = "spa_Latn"
tgt_lang = "eng_Latn"

with open(online_test_file, encoding="utf-8") as online:
  lines = [line.strip().split(" ||| ") for line in online.readlines()]
  scores = [float(line[0].strip()) for line in lines]
  fuzzy_source_sentences = [line[1].strip() for line in lines]
  online_source_sentences = [line[2].strip() for line in lines]
  fuzzy_target_prefixes = [line[3].strip() for line in lines]

print(fuzzy_source_sentences[0])
print(online_source_sentences[0])
print(fuzzy_target_prefixes[0])

Período de validez después de abierto el envase: 4 semanas
Período de validez después de abierto el envase: 10 horas.
Shelf life after opening the immediate packaging: 4 weeks.


# Load the models

In [None]:
#!pip install ctranslate2 sentencepiece -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
!ls /content/models/ct2-nllb-200-3.3B-int8

ls: cannot access '/content/models/ct2-nllb-200-3.3B-int8': No such file or directory


In [None]:
# Example of converting an NLLB model to CTranslate2 with int8 quantization

#!ct2-transformers-converter --model facebook/nllb-200-1.3B --quantization int8 --output_dir /content/models/ct2-nllb-200-1.3B-int8

config.json:   0% 0.00/808 [00:00<?, ?B/s]config.json: 100% 808/808 [00:00<00:00, 6.21MB/s]
2025-07-13 21:29:58.635083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752442198.897765    7060 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752442198.964322    7060 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-13 21:29:59.546839: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
pytorch_model.bin: 100% 5.48G/5

In [None]:
# Download the SentencePiece model

#!wget https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model

--2025-07-13 21:16:43--  https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.0.37, 52.217.87.206, 52.216.140.174, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.0.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4852054 (4.6M) [binary/octet-stream]
Saving to: ‘flores200_sacrebleu_tokenizer_spm.model’


2025-07-13 21:16:44 (9.30 MB/s) - ‘flores200_sacrebleu_tokenizer_spm.model’ saved [4852054/4852054]



In [9]:
import os

# [Modify] Set paths to the CTranslate2 and SentencePiece models
#!mkdir -p /content/models
#!cp -r /content/ct2-nllb* /content/models
drive = "../models"

ct_model_path = os.path.join(drive, "ct2-nllb-200-1.3B-int8")
sp_model_path = os.path.join(drive, "flores200_sacrebleu_tokenizer_spm.model")

In [10]:
import ctranslate2
import sentencepiece as spm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the source SentecePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Load the CTranslate2 model
translator = ctranslate2.Translator(ct_model_path, device=device)

  import pkg_resources


In [11]:
sp.encode_as_pieces("English:")

src_lang = "eng_Latn"
tgt_lang = "spa_Latn"

# Translate (source sentences only)

In [12]:
source = """The UN Environment Programme (UNEP) and the Food and Agriculture Organization of the UN (FAO) have named the first World Restoration Flagships for this year, tackling pollution, unsustainable exploitation, and invasive species in three continents.
These initiatives are restoring almost five million hectares of marine ecosystems – an area about the size of Costa Rica, which, together with France, is hosting the 3rd UN Ocean Conference.

The three new flagships comprise restoration initiatives in the coral-rich Northern Mozambique Channel Region, more than 60 of Mexico’s islands and the Mar Menor in Spain, Europe’s first ecosystem with legal personhood.
The winning initiatives were announced at an event during the UN Ocean Conference in Nice, France, and are now eligible for UN support.

“After decades of taking the ocean for granted, we are witnessing a great shift towards restoration.
But the challenge ahead of us is significant and we need everyone to play their part,” said Inger Andersen, Executive Director of UNEP.
“These World Restoration Flagships show how biodiversity protection, climate action, and economic development are deeply interconnected.
To deliver our restoration goals, our ambition must be as big as the ocean we must protect.”

FAO Director-General QU Dongyu said: “The climate crisis, unsustainable exploitation practices and nature resources shrinking are affecting our blue ecosystems, harming marine life and threatening the livelihoods of dependent communities.
These new World Restoration Flagships show that halting and reversing degradation is not only possible, but also beneficial to planet and people."

The World Restoration Flagship awards are part of the UN Decade on Ecosystem Restoration – led by UNEP and FAO – which aims to prevent, halt, and reverse the degradation of ecosystems on every continent and in every ocean.
The awards track notable initiatives that support global commitments to restore one billion hectares – an area larger than China – by 2030."""

source_sents = [sent.strip() for sent in source.split("\n")]
print(*source_sents, sep="\n")

The UN Environment Programme (UNEP) and the Food and Agriculture Organization of the UN (FAO) have named the first World Restoration Flagships for this year, tackling pollution, unsustainable exploitation, and invasive species in three continents.
These initiatives are restoring almost five million hectares of marine ecosystems – an area about the size of Costa Rica, which, together with France, is hosting the 3rd UN Ocean Conference.

The three new flagships comprise restoration initiatives in the coral-rich Northern Mozambique Channel Region, more than 60 of Mexico’s islands and the Mar Menor in Spain, Europe’s first ecosystem with legal personhood.
The winning initiatives were announced at an event during the UN Ocean Conference in Nice, France, and are now eligible for UN support.

“After decades of taking the ocean for granted, we are witnessing a great shift towards restoration.
But the challenge ahead of us is significant and we need everyone to play their part,” said Inger Ande

In [13]:
src_lang = "eng_Latn"
tgt_lang = "spa_Latn"

beam_size = 2

# Replace special characters in source_sents, like “,«, –
source_sents = [sent.replace("“", '"').replace("”", '"') for sent in source_sents]
source_sents = [sent.replace("–", "-") for sent in source_sents]

# Remove empty string from source_sents
source_sents = [sent.strip() for sent in source_sents if sent.strip()]
target_prefix = [[tgt_lang]] * len(source_sents)

# Subword the source sentences
source_sents_subworded = sp.encode_as_pieces(source_sents)
source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]

# Translate the source sentences
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          target_prefix=target_prefix)
translations = [translation.hypotheses[0] for translation in translations]

# Desubword the target sentences
translations_desubword = sp.decode(translations)
translations_desubword = [sent[len(tgt_lang):].strip() for sent in translations_desubword]

print(*translations_desubword[:10], sep="\n")

El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado los primeros buques insignia de restauración mundial para este año, que abordan la contaminación, la explotación insostenible y las especies invasoras en tres continentes.
Estas iniciativas están restaurando casi cinco millones de hectáreas de ecosistemas marinos, un área de aproximadamente el tamaño de Costa Rica, que, junto con Francia, acoge la 3a Conferencia de las Naciones Unidas sobre los Océanos.
Los tres nuevos buques insignia incluyen iniciativas de restauración en la región del canal de Mozambique, rica en corales, más de 60 de las islas de México y el Mar Menor en España, el primer ecosistema europeo con personalidad jurídica.
Las iniciativas ganadoras se anunciaron en un evento durante la Conferencia de las Naciones Unidas sobre el Océano en Niza, Francia, y ahora son elegibles para el apoyo de las Naciones Unida

In [27]:
# Save the translations
with open("testUNEP.en", "w+") as output:
  for translation in translations_desubword:
    output.write(translation + "\n")

# Fuzzy search indexer TODO

In [5]:
import pandas as pd
import re
from typing import List, Dict, Optional, Union, Callable, Any, Tuple
from rapidfuzz import fuzz, process, utils
import numpy as np


class MultilingualGlossaryProcessor:
    """
    A class for processing multilingual glossaries using RapidFuzz for fuzzy string matching.
    """
    
    def __init__(self, glossary_path: str):
        """
        Initialize the processor with a glossary file.
        
        Args:
            glossary_path: Path to CSV/Excel file with columns:
                          Keyword, Category, English, Arabic, French, Spanish, Chinese, Russian, Portuguese, Swahili
        """
        if glossary_path.endswith('.xlsx') or glossary_path.endswith('.xls'):
            self.glossary = pd.read_excel(glossary_path)
        else:
            self.glossary = pd.read_csv(glossary_path)
        
        # Define available languages
        self.languages = ['English', 'Arabic', 'French', 'Spanish', 'Chinese', 'Russian', 'Portuguese', 'Swahili']
        
        # Validate glossary structure
        required_columns = ['Keyword', 'Category'] + self.languages
        missing_columns = [col for col in required_columns if col not in self.glossary.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")
    
    def find_best_fuzzy_match(
        self,
        query: str,
        source_language: str,
        target_languages: List[str],
        scorer: Callable = fuzz.WRatio,
        processor: Optional[Callable] = None,
        score_cutoff: Optional[float] = 60.0,
        process_method: str = "extractOne"
    ) -> Dict[str, Union[str, Dict[str, str]]]:
        """
        Find the best fuzzy match in the glossary for a given query.
        
        Args:
            query: The text to search for
            source_language: Language of the query
            target_languages: List of target languages to return translations
            scorer: RapidFuzz scorer function (default: fuzz.WRatio)
            processor: Text preprocessing function (default: None)
            score_cutoff: Minimum similarity score (default: 60.0)
            process_method: RapidFuzz process method ("extractOne", "extract", "cdist", "cpdist")
            
        Returns:
            Dictionary with best_fuzzy match and translations in target languages
        """
        if source_language not in self.languages:
            raise ValueError(f"Source language '{source_language}' not supported. Available: {self.languages}")
        
        invalid_targets = [lang for lang in target_languages if lang not in self.languages]
        if invalid_targets:
            raise ValueError(f"Invalid target languages: {invalid_targets}. Available: {self.languages}")
        
        # Get all terms in source language (excluding NaN values)
        source_terms = self.glossary[source_language].dropna().tolist()
        
        if not source_terms:
            return {"best_fuzzy": "", "result": {}}
        
        # Find best match using specified process method
        if process_method == "extractOne":
            result = process.extractOne(
                query, 
                source_terms, 
                scorer=scorer, 
                processor=processor, 
                score_cutoff=score_cutoff
            )
        elif process_method == "extract":
            results = process.extract(
                query, 
                source_terms, 
                scorer=scorer, 
                processor=processor, 
                limit=1, 
                score_cutoff=score_cutoff
            )
            result = results[0] if results else None
        elif process_method == "cdist":
            # Using cdist for single query
            distances = process.cdist(
                [query], 
                source_terms, 
                scorer=scorer, 
                processor=processor, 
                score_cutoff=score_cutoff
            )
            if distances.size > 0:
                best_idx = np.argmax(distances[0])
                if distances[0][best_idx] >= (score_cutoff or 0):
                    result = (source_terms[best_idx], distances[0][best_idx], best_idx)
                else:
                    result = None
            else:
                result = None
        elif process_method == "cpdist":
            # cpdist requires equal length arrays, so we'll use the query repeated
            if len(source_terms) > 0:
                distances = process.cpdist(
                    [query] * len(source_terms), 
                    source_terms, 
                    scorer=scorer, 
                    processor=processor, 
                    score_cutoff=score_cutoff
                )
                if distances.size > 0:
                    best_idx = np.argmax(distances)
                    if distances[best_idx] >= (score_cutoff or 0):
                        result = (source_terms[best_idx], distances[best_idx], best_idx)
                    else:
                        result = None
                else:
                    result = None
            else:
                result = None
        else:
            raise ValueError(f"Unsupported process method: {process_method}")
        
        if not result:
            return {"best_fuzzy": "", "result": {}}
        
        best_match, score, index = result
        
        # Find the row containing this match
        match_row = self.glossary[self.glossary[source_language] == best_match].iloc[0]
        
        # Get translations for target languages
        translations = {}
        for lang in target_languages:
            translation = match_row[lang]
            if pd.notna(translation):
                translations[lang] = str(translation)
            else:
                translations[lang] = ""
        
        return {
            "best_fuzzy": best_match,
            "score": score,
            "result": translations
        }
    
    def _remove_overlapping_matches(self, matches: List[Dict]) -> List[Dict]:
        """
        Remove overlapping matches, keeping the longest/highest scoring ones.
        
        Args:
            matches: List of match dictionaries with 'start', 'end', 'score', etc.
            
        Returns:
            Filtered list with non-overlapping matches
        """
        if not matches:
            return []
        
        # Sort by length (descending) then by score (descending)
        sorted_matches = sorted(matches, 
                              key=lambda x: (x['end'] - x['start'], x['score']), 
                              reverse=True)
        
        final_matches = []
        used_positions = set()
        
        for match in sorted_matches:
            # Check if this match overlaps with any already selected match
            match_positions = set(range(match['start'], match['end']))
            
            if not match_positions.intersection(used_positions):
                # No overlap, add this match
                final_matches.append(match)
                used_positions.update(match_positions)
        
        # Sort final matches by position in text
        final_matches.sort(key=lambda x: x['start'])
        return final_matches
    
    def find_all_fuzzy_matches_in_text(
        self,
        text: str,
        source_language: str,
        target_languages: List[str],
        scorer: Callable = fuzz.partial_ratio,
        processor: Optional[Callable] = None,
        score_cutoff: Optional[float] = 80.0,
        min_word_length: int = 1,
        limit: int = None
    ) -> List[Dict[str, Union[str, Dict[str, str]]]]:
        """
        Find all glossary terms that fuzzy match within a given text using extract method and token_set_ratio.
        Efficiently searches for glossary entries in the text and handles overlapping matches.
        
        Args:
            text: Input text to search within
            source_language: Language of the input text
            target_languages: List of target languages to return translations
            scorer: RapidFuzz scorer function (default: fuzz.partial_ratio)
            processor: Text preprocessing function (default: None)
            score_cutoff: Minimum similarity score (default: 80.0)
            min_word_length: Minimum length of words to consider (default: 2)
            limit: Maximum number of matches to return (default: None for all matches)
            
        Returns:
            List of dictionaries with found matches and their translations
        """
        if source_language not in self.languages:
            raise ValueError(f"Source language '{source_language}' not supported. Available: {self.languages}")
        
        invalid_targets = [lang for lang in target_languages if lang not in self.languages]
        if invalid_targets:
            raise ValueError(f"Invalid target languages: {invalid_targets}. Available: {self.languages}")
        
        # Get all terms in source language (excluding NaN values)
        source_terms = self.glossary[source_language].dropna().tolist()
        
        if not source_terms:
            return []
        
        # Filter terms by minimum word length
        filtered_terms = [term for term in source_terms if len(str(term).strip()) >= min_word_length]
        
        if not filtered_terms:
            return []
        
        # Sort glossary terms by length (longest first) for better matching
        source_terms_sorted = sorted(filtered_terms, key=len, reverse=True)
        
        all_matches = []
        
        # Use extract method with token_set_ratio as the only scorer
        extract_results = process.extract(
            text,
            source_terms_sorted,
            scorer=scorer,
            processor=processor,
            score_cutoff=score_cutoff,
            limit=limit
        )
        
        if not extract_results:
            return []
        
        #print(f"Found {len(extract_results)} matches in text using extract method with token_set_ratio")
        
        # Process each match result
        for match_term, similarity, _ in extract_results:
            # Find the row in glossary containing this match
            match_rows = self.glossary[self.glossary[source_language] == match_term]
            
            if match_rows.empty:
                continue
                
            match_row = match_rows.iloc[0]
            
            # Get translations for target languages
            translations = {}
            for lang in target_languages:
                translation = match_row[lang] if lang in match_row else None
                if pd.notna(translation):
                    translations[lang] = str(translation)
                else:
                    translations[lang] = ""
            
            # Find approximate positions in text for overlap detection
            # Using case-insensitive search to find the term in text
            text_lower = text.lower()
            term_lower = match_term.lower()
            
            # Try to find the exact match position
            start_pos = text_lower.find(term_lower)
            if start_pos != -1:
                end_pos = start_pos + len(match_term)
            else:
                # If exact match not found, use fuzzy position estimation
                # Split text into words and try to find approximate position
                words = text.split()
                best_match_idx = 0
                best_score = 0
                
                term_words = match_term.split()
                term_length = len(term_words)
                
                # Search for best matching position using sliding window
                for i in range(len(words) - term_length + 1):
                    window_text = " ".join(words[i:i + term_length])
                    window_score = fuzz.token_set_ratio(window_text.lower(), term_lower)
                    if window_score > best_score:
                        best_score = window_score
                        best_match_idx = i
                
                # Calculate approximate positions based on best match
                if best_match_idx < len(words):
                    words_before = " ".join(words[:best_match_idx])
                    start_pos = len(words_before) + (1 if words_before else 0)
                    
                    matched_words = words[best_match_idx:best_match_idx + term_length]
                    end_pos = start_pos + len(" ".join(matched_words))
                else:
                    start_pos = 0
                    end_pos = len(match_term)
            
            # Extract the actual text segment that was matched
            if start_pos >= 0 and end_pos <= len(text):
                found_text = text[start_pos:end_pos]
            else:
                found_text = match_term  # Fallback to the glossary term
            
            all_matches.append({
                "found_in_text": found_text,
                "best_fuzzy": match_term,
                "score": similarity,
                "result": translations,
                "start": start_pos,
                "end": end_pos
            })
        
        # Remove overlapping matches (prefer longer and higher scoring matches)
        final_matches = self._remove_overlapping_matches(all_matches)
        
        # Remove position information from final output and sort by score
        result_matches = []
        for match in final_matches:
            result_match = {k: v for k, v in match.items() if k not in ['start', 'end']}
            result_matches.append(result_match)
        
        # Sort by score (highest first)
        result_matches.sort(key=lambda x: x["score"], reverse=True)
        
        return result_matches

    def find_nearly_exact_english_matches(
        self,
        text: str,
        target_languages: List[str],
        score_cutoff: float = 95.0,
        normalize_text: bool = True,
        remove_overlaps: bool = True
    ) -> List[Dict[str, Union[str, Dict[str, str]]]]:
        """
        Find nearly-exact matches for English glossary terms in the given text.
        Optionally normalizes the text and glossary terms before matching.
        
        Args:
            text: Input English text
            target_languages: List of target languages to return translations
            score_cutoff: Minimum similarity score (default: 95.0)
            normalize_text: Whether to normalize text and terms (default: True)
            remove_overlaps: Whether to remove overlapping matches (default: True)
        
        Returns:
            List of dictionaries with found matches and their translations
        """
        def normalize(s):
            s = str(s).strip()
            # Remove numerical substring at the start and its trailing space and punctuation
            s = re.sub(r'^\d+\s*', '', s) # Example: "123 term" -> "term", "123. term" -> "term"
            s = re.sub(r'[^\w\s]', '', s) # Remove punctuation
            s = re.sub(r'\s+', ' ', s) # Normalize whitespace
            return s
        
        if "English" not in self.languages:
            raise ValueError("English language not available in glossary.")
        
        invalid_targets = [lang for lang in target_languages if lang not in self.languages]
        if invalid_targets:
            raise ValueError(f"Invalid target languages: {invalid_targets}. Available: {self.languages}")
        
        # Get all English terms
        english_terms = self.glossary["English"].dropna().tolist()
        if not english_terms:
            return []
        
        # Sort terms by length (longest first) for better matching priority
        english_terms_sorted = sorted(english_terms, key=len, reverse=True)
        
        all_matches = []
        
        # For each glossary term, try to find it in the text
        for orig_term in english_terms_sorted:
            # Normalize term if requested
            if normalize_text:
                search_term = normalize(orig_term)
                search_text = normalize(text)
            else:
                search_term = orig_term.strip()
                search_text = text.strip()
            
            # Split into words for position tracking
            term_words = search_term.split()
            if not term_words:
                continue
            
            # Find all possible matches in the text
            text_words = search_text.split()
            term_length = len(term_words)
            
            for i in range(len(text_words) - term_length + 1):
                # Get n-gram from text
                ngram_words = text_words[i:i + term_length]
                ngram_text = " ".join(ngram_words)
                
                # Calculate similarity using token_set_ratio for better subset matching
                score = fuzz.token_set_ratio(ngram_text, search_term)
                
                if score >= score_cutoff:
                    # Find positions in original text
                    # This is approximate since we're working with normalized text
                    original_words = text.split()
                    if i < len(original_words) and i + term_length <= len(original_words):
                        # Get the original text segment
                        original_segment = " ".join(original_words[i:i + term_length])
                        
                        # Estimate positions (approximate)
                        start_pos = text.lower().find(original_segment.lower())
                        if start_pos == -1:
                            # Fallback: use word-based estimation
                            words_before = " ".join(original_words[:i])
                            start_pos = len(words_before) + (1 if words_before else 0)
                        end_pos = start_pos + len(original_segment)
                        
                        # Get translations
                        match_row = self.glossary[self.glossary["English"] == orig_term].iloc[0]
                        translations = {}
                        for lang in target_languages:
                            translation = match_row[lang]
                            if pd.notna(translation):
                                translations[lang] = str(translation)
                            else:
                                translations[lang] = ""
                        
                        all_matches.append({
                            "found_in_text": original_segment,
                            "best_fuzzy": orig_term,
                            "score": score,
                            "result": translations,
                            "start": start_pos,
                            "end": end_pos
                        })
        
        # Remove overlapping matches if requested
        if remove_overlaps:
            final_matches = self._remove_overlapping_matches(all_matches)
        else:
            final_matches = all_matches
        
        # Remove position information from final output and sort by score
        result_matches = []
        for match in final_matches:
            result_match = {k: v for k, v in match.items() if k not in ['start', 'end']}
            result_matches.append(result_match)
        
        # Sort by score (highest first)
        result_matches.sort(key=lambda x: x["score"], reverse=True)
        
        return result_matches


def create_processor_function(processor_type: str) -> Optional[Callable]:
    """
    Create a processor function based on the specified type.
    
    Args:
        processor_type: Type of processor ("none", "default", "custom")
        
    Returns:
        Processor function or None
    """
    if processor_type == "none":
        return None
    elif processor_type == "default":
        return utils.default_process
    elif processor_type == "custom":
        # Custom processor that handles special cases
        def custom_processor(text):
            if not text:
                return ""
            # Remove extra whitespace, keep alphanumeric and spaces
            processed = re.sub(r'^\d+\s*', '', str(text).strip()) # Example: "123 term" -> "term", "123. term" -> "term"
            #processed = re.sub(r'[^\w\s]', '', processed) # Remove punctuation
            processed = re.sub(r'\s+', ' ', processed)
            return processed
        return custom_processor
    else:
        raise ValueError(f"Unknown processor type: {processor_type}")


# Example usage and testing functions
def example_usage():
    """
    Example usage of the MultilingualGlossaryProcessor.
    """
    # Initialize processor (assuming you have a glossary file)
    processor = MultilingualGlossaryProcessor("data/glossaryUNEP_corrected.xlsx")
    
    # Example 1: Find best fuzzy match
    result1 = processor.find_best_fuzzy_match(
        query="UN Environment Program",
        source_language="English",
        target_languages=["French", "Spanish", "Arabic"],
        scorer=fuzz.WRatio,
        processor=utils.default_process,
        score_cutoff=70.0,
        process_method="extractOne"
    )
    print("Best match result:", result1)
    
    # Example 2: Find all matches in text with overlap handling
    text = "UN Environment Programme is sponsored by UNESCO."
    result2 = processor.find_all_fuzzy_matches_in_text(
        text=text,
        source_language="English",
        target_languages=["French", "Spanish"],
        processor=utils.default_process,
        score_cutoff=80.0
    )
    print(f"All matches in text (no overlaps), {len(result2)} results in total:", result2)
    
    # Example 3: Nearly-exact English matches with normalization
    english_text = "The UN Environment Programme and UNEA are working on the International Day of Women Judge with organizations and developing new policies."
    result3 = processor.find_nearly_exact_english_matches(
        text=english_text,
        target_languages=["French", "Spanish"],
        score_cutoff=95.0,
        normalize_text=True,
        remove_overlaps=True
    )
    print(f"Nearly-exact English matches, {len(result3)} in total:", result3)
    
    # Example 4: More complex text
    complex_text = "This year, the United Nations Environment Programme (UNEP) and the SSC (South-South cooperation) are presiding the COP on Climate Change to address persistent organic pollutants before the UNEA7 with FAO and UNESCO, where the 1. total greenhouse gas emissions per year indicator is expected to be reduced by 50%."
    result4 = processor.find_all_fuzzy_matches_in_text(
        text=complex_text,
        source_language="English",
        target_languages=["French", "Spanish"],
        processor=utils.default_process,
        score_cutoff=95.0
    )
    print(f"Complex text matches RESULT #4, {len(result4)} results in total:", result4)

    # Example 5: Nearly-exact English matches with normalization
    english_text = complex_text
    result5 = processor.find_nearly_exact_english_matches(
        text=english_text,
        target_languages=["French", "Spanish"],
        score_cutoff=90.0,
        normalize_text=True,
        remove_overlaps=True
    )
    print(f"Nearly-exact English matches: {len(result5)} matches in total: ", result5)
    # print set of best_fuzzy and result['Spanish'] of result5
    bilingual_pairs = [(match['best_fuzzy'], match['result'].get('Spanish', '')) for match in result5]
    print("Bilingual pairs (best_fuzzy, Spanish translation):")
    for pair in bilingual_pairs:
        print(pair)


if __name__ == "__main__":
    example_usage()
        

Best match result: {'best_fuzzy': 'UN Environment Programme', 'score': 95.65217391304348, 'result': {'French': '', 'Spanish': 'Programa ONU Medio Ambiente', 'Arabic': ''}}
All matches in text (no overlaps), 2 results in total: [{'found_in_text': 'ore', 'best_fuzzy': 'ore', 'score': 100.0, 'result': {'French': 'minerais', 'Spanish': 'yacimientos minerales'}}, {'found_in_text': 'UN Environment Programme is', 'best_fuzzy': 'United Nations Environment Programme', 'score': 80.0, 'result': {'French': '', 'Spanish': 'Programa de las Naciones Unidas para el Medio Ambiente'}}]
Nearly-exact English matches, 3 in total: [{'found_in_text': 'UN Environment Programme', 'best_fuzzy': 'UN Environment Programme', 'score': 100.0, 'result': {'French': '', 'Spanish': 'Programa ONU Medio Ambiente'}}, {'found_in_text': 'UNEA', 'best_fuzzy': 'UNEA', 'score': 100.0, 'result': {'French': '', 'Spanish': ''}}, {'found_in_text': 'International Day of Women Judge', 'best_fuzzy': 'International Day of Women Judges'

In [8]:
text1 = "This year, the World Restoration Flagships of United Nations Environment Programme (UNEP) and the SSC (South-South cooperation) are presiding the COP on Climate Change where the Member States will address persistent organic pollutants before the UNEA7 with FAO and UNESCO, where the 1. Total greenhouse gas emissions per year indicator is expected to be reduced by 50%."    
text2 = "greenhouse gas emission"
text3 = "World Restoration Flagship"
text4 = "United States"

# Compare token ratio fuzzy
print(fuzz.token_set_ratio(text1, text2))
print(fuzz.token_set_ratio(text2, text1))
print(fuzz.token_set_ratio(text3, text1))
print(fuzz.token_set_ratio(text4, text1))
print(fuzz.partial_ratio(text1, text2))
print(fuzz.partial_ratio(text1, text4))
print(fuzz.partial_token_set_ratio(text1, text4))


75.67567567567568
75.67567567567568
79.06976744186046
100.0
100.0
69.23076923076923
100.0


In [48]:
story1 = """

Nairobi, 10 July 2025 – As extreme heat grips many countries and becomes “the new normal”, the UN Environment Programme (UNEP) warns of heightened health risks for older persons in the Frontiers 2025 Report published today. Other highlighted impacts of climate change include the melting of glaciers that reawaken ancient pathogens and floods that risk releasing dangerous chemicals. 

The 7th edition of the Frontiers Report, The Weight of Time - Facing a new age of challenges for people and ecosystems, is part of UNEP’s Foresight Trajectory initiative and highlights emerging environmental issues as well as potential solutions. The first edition in 2016, warned of the growing risk of zoonotic diseases, four years before the COVID-19 pandemic. This report is released as communities across China, Japan, India, Europe, USA and elsewhere face weeks of extreme heat and flooding. 

“Heat waves are among the most frequent and deadly impacts of climate change, along with floods and shrinking ice cover,” said Inger Andersen, Executive Director of UNEP. “We must be prepared for the risks these impacts pose, especially for society’s most vulnerable, including older persons. Yet as this year’s Frontiers Report shows, solutions exist that can help protect communities and restore ecosystems long-thought to have been lost.” 

Adults aged 65 and above now form an increasingly dominant part of the world population, particularly in urban areas of low- and middle-income countries. The report notes that annual heat-related deaths among older persons have risen by an estimated 85% since the 1990s. Additional risks arise from deteriorating air quality and floods in low-lying coastal cities where older persons live. 

Older persons — especially those with chronic illnesses, limited mobility, or frailty — are particularly vulnerable to heat-related health issues, including respiratory, cardiovascular, and metabolic diseases, as well as increased mortality. 

The report recommends making cities pollution-free, resilient, and accessible spaces with expansive vegetation. Key strategies include better urban planning, community-based disaster risk management, and improved access to climate information for older populations. 

Earlier this year, the UN Human Rights Council adopted a new resolution to develop an “international legally binding instrument on the human rights of older persons,” a possible path to add safety to those most exposed to climate change. 

Zombie microbes 

Beyond the risks to older persons, the report also warns of ancient microbes awakening. Should global temperatures rise more than 2˚C above pre-industrial levels, this would significantly reduce the cryosphere in mass, which includes glaciers, seasonal snow, ice sheets and shelves, sea ice, seasonally frozen ground, and permafrost. Cryospheric regions are home to 670 million people as well as to billions more who live in areas with water originating from those frozen areas. 

Dormant fungi, bacteria, and viruses in these frozen regions could reactivate, raising the risk of antimicrobial resistance. To slow down the decline of the cryosphere, the Frontiers 2025 Report recommends cutting greenhouse gas emissions – including black carbon emissions from diesel engines, open-field agricultural burning, and wildfires – and limiting tourism in fragile frozen regions. The report also recommends accelerating scientific research into the diversity of cryospheric microorganisms that will not survive the cryosphere’s decline. 

The return of banned chemicals through floods 

The report also identifies risks from the remobilization of chemicals that were banned and phased-out decades ago. Floods can bring such chemicals to the surface, after having accumulated in sediment over centuries. 

As floodwaters stir up sediment and debris, toxic chemicals may be released and re-enter urban areas or food systems. The report lists effective measures to reduce this growing risk: traditional control measures like polders, dikes and retention basins, improved drainage systems, nature-based solutions (e.g., sponge-city approaches), regular monitoring of pollutants in diverse locations and products, and economic impact studies about this type of pollution. 

The risk of ageing dams 

Another emerging threat the Frontiers 2025 Report addresses is the risk of ageing dams. Alongside many benefits, dams can harm indigenous and fishing-dependent communities, as well as degrade ecosystems. Removal of large, older dams that have become unsafe, obsolete, or economically unviable is increasingly happening in Europe and North America. 

The report highlights potential benefits of the removal of dams and barriers in restoring natural river connectivity for biodiversity and ecosystems. Reversing river fragmentation and restoring natural processes support the implementation of the UN’s principles for ecosystem-restoration initiatives . 

 

NOTES TO EDITORS 

About the UN Environment Programme (UNEP) 

UNEP is the leading global voice on the environment. It provides leadership and encourages partnership in caring for the environment by inspiring, informing and enabling nations and peoples to improve their quality of life without compromising that of future generations. 

For more information, please contact: 

News and Media Unit, UN Environment Programme 
"""

story1 = """

On the outskirts of Cape Town, South Africa, sits a bustling garment factory owned by Cape Union Mart, one of the country’s best-known outdoor apparel retailers.  

The plant produces about 190,000 puffer jackets a year and Cape Union Mart had been searching for ways to reduce its environmental footprint. So, in 2021 the company partnered with the United Nations Environment Programme (UNEP). The goal was to chart how much water and energy were going into each jacket, and figure out how to use fewer resources. 

This life cycle assessment revealed that more than 80 per cent of a jacket’s environmental impact comes from the production of the fabric it’s made from. That led Cape Union Mart to review its procurement criteria, said Pre-Production Manager Michelle Goddard. The company now only buys material from suppliers that meet stringent environmental standards. 

“Customers are definitely looking for more sustainable products and, being environmentally responsible makes sense for our brand,” says Goddard. “But even more than that, business as usual is no longer an option. Human activities have caused the Earth’s systems to exceed six of the nine planetary boundaries required for a healthy planet and there is an urgent need for companies to integrate sustainable practices.” 

The life cycle assessment was part of a larger UNEP effort to support small- and medium-sized textile manufacturers reduce their environmental impact. From Tunisia to South Africa, this UNEP programme has worked with clothing makers to cut greenhouse gas emissions, phase out toxic chemicals and reduce water use. The programme comes at a time when a growing number of experts raise the alarm about the environmental fallout from the fashion industry. Clothing production is a driver of climate change, a voracious consumer of raw materials, and in some places, a significant source of pollution. 

“The world is realizing that we need to change our approach to fashion and textiles,” says Elisa Tonda, Chief of UNEP’s Resources and Markets branch. “We need to design products to be reuseable, durable and recyclable in order to protect the planet from the effects of pollution and waste.”  

On the outskirts of Cape Town, South Africa, sits a bustling garment factory owned by Cape Union Mart, one of the country’s best-known outdoor apparel retailers. The plant produces about 190,000 puffer jackets a year and Cape Union Mart had been searching for ways to reduce its environmental footprint. So, in 2021 the company partnered with the United Nations Environment Programme (UNEP). The goal was to chart how much water and energy were going into each jacket, and figure out how to use fewer resources. This life cycle assessment revealed that more than 80 per cent of a jacket’s environmental impact comes from the production of the fabric it’s made from. That led Cape Union Mart to review its procurement criteria, said Pre-Production Manager Michelle Goddard. The company now only buys material from suppliers that meet stringent environmental standards. “Customers are definitely looking for more sustainable products and, being environmentally responsible makes sense for our brand,” says Goddard. “But even more than that, business as usual is no longer an option. Human activities have caused the Earth’s systems to exceed six of the nine planetary boundaries required for a healthy planet and there is an urgent need for companies to integrate sustainable practices.” The life cycle assessment was part of a larger UNEP effort to support small- and medium-sized textile manufacturers reduce their environmental impact. From Tunisia to South Africa, this UNEP programme has worked with clothing makers to cut greenhouse gas emissions, phase out toxic chemicals and reduce water use. The programme comes at a time when a growing number of experts raise the alarm about the environmental fallout from the fashion industry. Clothing production is a driver of climate change, a voracious consumer of raw materials, and in some places, a significant source of pollution. “The world is realizing that we need to change our approach to fashion and textiles,” says Elisa Tonda, Chief of UNEP’s Resources and Markets branch. “We need to design products to be reuseable, durable and recyclable in order to protect the planet from the effects of pollution and waste.” Name: b5afa1be-ffdf-4b21-97ab-51b5435f1686 Credit: Courtesy Cape Union Mart Description: Women using sewing machines Link: NA Caption: With support from UNEP, South Africa’s Cape Union Mart updated its procurement criteria and now only buys material from suppliers that meet strict environmental standards. Between 2000 and 2015, clothing production doubled globally, according to the Ellen MacArthur Foundation. This rapid growth has expanded the industry’s impact on the environment. Experts say the pain is felt more acutely in developing countries. One study published in the journal Springer Nature found that more than 15,000 chemicals are used in making textiles. As well, producing just 1 kilogramme of textiles takes over 0.5 kilogrammes of chemicals. To help change that, UNEP launched the InTex programme in 2020. Officially called the Innovative Business Practices and Economic Models in the Textile Value Chain, InText is funded by the European Union and Denmark. It focuses on small- and medium-sized enterprises, which are the vast majority of the world’s textile manufacturers, in five nations: India, Indonesia, Kenya, South Africa and Tunisia. InText is part of UNEP’s broader Textile Initiative, which aims to create a cleaner, more sustainable textile industry. At the core of InTex is the idea that the textile industry must move off its linear business model, where resources are gobbled up to make flimsy, disposable clothes. InTex touts a model where garments are made to last, material is recycled, and the use of resources – from water to chemicals – is kept to a minimum. This process is known as circularity and along with sparing the environment, it could generate up to US$700 billion in business opportunities by 2030, found the Ellen MacArthur Foundation. That potential is something Bilel Ben Miled knows well. He’s the head of sustainability at Tunisia’s Gonser Group, a clothing maker. With UNEP’s help, the company did a deep dive into the environmental footprint of the denim it produces in a factory outside of Tunis, the Tunisian capital. The factory produces 600,000 garments a year. Following the assessment, Gonser Group developed an energy management platform to track in real time the use of water, steam and gas in the factory. Ben Miled says the system, expected to launch later this year, will help the company spot waste and become more resource efficient, key in a country facing severe water shortages. Name: 20231019_111932 Credit: Courtesy Gonser Group Description: A factory floor with large washing machines Link: NA Caption: Tunisia’s Gonser Group, with support from UNEP, is evaluating an automated system that could cut the factory’s chemical use by 25 per cent. Gonser Group also plans to install 300 kilowatts of solar panels on the factory’s roof by 2026, which would cut electricity use by 40 per cent. And it’s evaluating an automated chemical dosing system that could cut the factory’s chemical use by 25 per cent. “We need to reduce our environmental footprint if we’re going to leave a better world for our children and the following generations,” says Ben Miled. The work with InTex is part of a broader sustainability push by the Gonser Group. Ben Miled says the company recycles up to 90 per cent of its water. The firm has also certified that more than 90 per cent of the chemicals it uses meet an environmental standard developed for apparel makers. Gonser Group is a major supplier to European brands, many of which are facing stricter environmental rules, especially on chemicals use. One of the big beneftis of UNEP’s InTex programme is that it helps companies improve efficiency, cut costs, and meet the growing demand from sustainable markets in the EU. “Brands are so engaged now when it comes to the environment,” Ben Miled says. “If we want to continue our business activities and expand our markets, we need to invest in sustainability.” UNEP’s Tonda agrees. “By considering sustainability and circularity in their procurement criteria, international brands can influence changes in the entire supply chain, decreasing the overall environmental impacts of production”. Since its launch in 2020, InTex has supported 32 small and medium enterprises develop roadmaps for reducing their environmental and socio-economic impact. Some 230 company representatives have been trained in circularity. The programme is now expanding to India and Indonesia, aiming to work with 60 more business and improve access to financing for small textile makers, which often struggle to get loans. The European Union has recognized InTex for its role in building a more sustainable global textiles industry. “This is an important programme because it shows clothes makers that it is possible to both protect the planet and bolster the bottom line,” says Tonda. “And it also demonstrates that investing in sustainability can give smaller businesses a competitive edge.” ### UNEP's work is made possible by flexible contributions from Member States and other partners to the Environment Fund  and UNEP Climate, Nature and Pollution funds. These funds enable agile, innovative solutions to climate change, nature and biodiversity loss, and pollution and waste. Learn how to support UNEP to invest in people and planet.
With support from UNEP, South Africa’s Cape Union Mart updated its procurement criteria and now only buys material from suppliers that meet strict environmental standards. Courtesy Cape Union Mart 

Between 2000 and 2015, clothing production doubled globally, according to the Ellen MacArthur Foundation. This rapid growth has expanded the industry’s impact on the environment. Experts say the pain is felt more acutely in developing countries. One study published in the journal Springer Nature found that more than 15,000 chemicals are used in making textiles. As well, producing just 1 kilogramme of textiles takes over 0.5 kilogrammes of chemicals. 

To help change that, UNEP launched the InTex programme in 2020. Officially called the Innovative Business Practices and Economic Models in the Textile Value Chain, InText is funded by the European Union and Denmark. It focuses on small- and medium-sized enterprises, which are the vast majority of the world’s textile manufacturers, in five nations: India, Indonesia, Kenya, South Africa and Tunisia. InText is part of UNEP’s broader Textile Initiative, which aims to create a cleaner, more sustainable textile industry.  

At the core of InTex is the idea that the textile industry must move off its linear business model, where resources are gobbled up to make flimsy, disposable clothes. InTex touts a model where garments are made to last, material is recycled, and the use of resources – from water to chemicals – is kept to a minimum. This process is known as circularity and along with sparing the environment, it could generate up to US$700 billion in business opportunities by 2030, found the Ellen MacArthur Foundation. 

That potential is something Bilel Ben Miled knows well. He’s the head of sustainability at Tunisia’s Gonser Group, a clothing maker. With UNEP’s help, the company did a deep dive into the environmental footprint of the denim it produces in a factory outside of Tunis, the Tunisian capital. The factory produces 600,000 garments a year. 

Following the assessment, Gonser Group developed an energy management platform to track in real time the use of water, steam and gas in the factory. Ben Miled says the system, expected to launch later this year, will help the company spot waste and become more resource efficient, key in a country facing severe water shortages.   

A factory floor with large washing machines 
Tunisia’s Gonser Group, with support from UNEP, is evaluating an automated system that could cut the factory’s chemical use by 25 per cent. Courtesy Gonser Group 

Gonser Group also plans to install 300 kilowatts of solar panels on the factory’s roof by 2026, which would cut electricity use by 40 per cent. And it’s evaluating an automated chemical dosing system that could cut the factory’s chemical use by 25 per cent. 

“We need to reduce our environmental footprint if we’re going to leave a better world for our children and the following generations,” says Ben Miled. 

The work with InTex is part of a broader sustainability push by the Gonser Group. Ben Miled says the company recycles up to 90 per cent of its water. The firm has also certified that more than 90 per cent of the chemicals it uses meet an environmental standard developed for apparel makers. 

Gonser Group is a major supplier to European brands, many of which are facing stricter environmental rules, especially on chemicals use. One of the big beneftis of UNEP’s InTex programme is that it helps companies improve efficiency, cut costs, and meet the growing demand from sustainable markets in the EU. 

“Brands are so engaged now when it comes to the environment,” Ben Miled says. “If we want to continue our business activities and expand our markets, we need to invest in sustainability.” 

UNEP’s Tonda agrees.  

“By considering sustainability and circularity in their procurement criteria, international brands can influence changes in the entire supply chain, decreasing the overall environmental impacts of production.” 

Since its launch in 2020, InTex has supported 32 small and medium enterprises develop roadmaps for reducing their environmental and socio-economic impact. Some 230 company representatives have been trained in circularity. The programme is now expanding to India and Indonesia, aiming to work with 60 more business and improve access to financing for small textile makers, which often struggle to get loans.  

The European Union has recognized InTex for its role in building a more sustainable global textiles industry. 

“This is an important programme because it shows clothes makers that it is possible to both protect the planet and bolster the bottom line,” says Tonda. “And it also demonstrates that investing in sustainability can give smaller businesses a competitive edge.” 

 

UNEP's work is made possible by flexible contributions from Member States and other partners to the Environment Fund  and UNEP Climate, Nature and Pollution funds. These funds enable agile, innovative solutions to climate change, nature and biodiversity loss, and pollution and waste. Learn how to support UNEP to invest in people and planet. 
"""

# Example 6: More complex text
processor = MultilingualGlossaryProcessor("data/glossaryUNEP_corrected.xlsx")
result7 = processor.find_all_fuzzy_matches_in_text(
    text=story1,
    source_language="English",
    target_languages=["French", "Spanish"],
    scorer=fuzz.token_set_ratio,
    processor=create_processor_function("custom"),
    score_cutoff=95.0
)
print("Complex text matches RESULT:", len(result7),"\n\n", result7)


Complex text matches RESULT: 67 

 [{'found_in_text': 'South Africa', 'best_fuzzy': 'South Africa', 'score': 100.0, 'result': {'French': 'Afrique du Sud', 'Spanish': 'Sudáfrica'}}, {'found_in_text': 'factory', 'best_fuzzy': 'factory', 'score': 100.0, 'result': {'French': 'usine', 'Spanish': 'fábrica'}}, {'found_in_text': '.  \n\nThe ', 'best_fuzzy': 'plant growth', 'score': 100.0, 'result': {'French': 'croissance de la plante', 'Spanish': 'Crecimiento de planta'}}, {'found_in_text': 'plant', 'best_fuzzy': 'plant', 'score': 100.0, 'result': {'French': 'plantes', 'Spanish': 'plantas'}}, {'found_in_text': 'environmental footprint', 'best_fuzzy': 'environmental footprint', 'score': 100.0, 'result': {'French': 'empreinte environnementale', 'Spanish': 'huella ambiental'}}, {'found_in_text': 'United Nations', 'best_fuzzy': 'United Nations', 'score': 100.0, 'result': {'French': 'Nations Unies', 'Spanish': 'naciones Unidas'}}, {'found_in_text': 'Programme', 'best_fuzzy': 'programme', 'score': 1

In [45]:
# Get list of str of key best_fuzzy
result7 = [match['best_fuzzy'] for match in result7 if 'best_fuzzy' in match]

print(result7)

['South Africa', 'factory', 'plant growth', 'plant', 'environmental footprint', 'United Nations', 'programme', 'energy demand', 'resources', 'life cycle', 'assessment', 'environmental impact of industry', 'production standard', 'material', 'environmental standard', 'sustainable production', 'business management', 'planet', 'impact assessment', 'textile', 'Tunisia', 'greenhouse gas', 'chemicals and waste', 'water resource', 'water use', 'clothing industry', 'impact of climate change', 'consumer waste', 'consumer', 'raw material', 'energy source', 'pollution', 'plant resource', 'design', 'chemical pollution', 'waste', 'Nature', 'programme management', 'European Union', 'India', 'textile industry', 'material management', 'water resources management', 'water supply', 'chemical process', 'process', 'environmental sustainability', 'sustainability', 'deep water', 'energy resource', 'toxic waste management', 'resource management', 'developing country', 'chemical industry', 'solar system', 'roo

In [33]:
from rapidfuzz import fuzz, process
two_words = "surface water management"
print(process.extract(story1, [two_words], scorer=fuzz.partial_ratio, limit=5))

# pretty print of result6
import pprint
pp = pprint.PrettyPrinter(indent=2)
#pp.pprint(result6)


import re
# Create a regex pattern to match first token in two_words until second token
re_pattern = rf"{re.escape(two_words.split()[0])}.*{re.escape(two_words.split()[-1])}"
re_patinverted = rf"{re.escape(two_words.split()[-1])}.*{re.escape(two_words.split()[0])}"
# Find all matches in the story
matches = re.findall(re_pattern, story1, flags=re.IGNORECASE)
matches += re.findall(re_patinverted, story1, flags=re.IGNORECASE)
matches = list(set(matches))
print("Matches found:", matches)


[('surface water management', 66.66666666666667, 0)]
Matches found: []


In [21]:
print(f"Number of matches found for token_set_radio: {len(result6)}")
print(f"Number of matches found for partial_ratio: {len(result7)}")
if result7==result6:
    print("Both methods found the same matches.")


Number of matches found for token_set_radio: 81
Number of matches found for partial_ratio: 81
Both methods found the same matches.


In [49]:
# Install NLTK if needed and import stemming functionality
try:
    import nltk
    from nltk.stem import PorterStemmer
    # Download required NLTK data if not present
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
except ImportError:
    print("Installing NLTK...")
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk
    from nltk.stem import PorterStemmer
    nltk.download('punkt')

# Initialize the stemmer
stemmer = PorterStemmer()

def check_best_fuzzy_in_story_with_stemming(result_list, story_text):
    """
    Check if the 'best_fuzzy' values from the input result list are contained in story1.
    Uses regex patterns to match both normal and inverted token order with stemming applied.
    
    Args:
        result_list: List of dictionaries containing 'best_fuzzy' keys
        story_text: The story text to search within
        
    Returns:
        Dictionary with match results for each best_fuzzy term
    """
    if not result_list:
        return {}
    
    match_results = {}
    
    for item in result_list:
        if 'best_fuzzy' not in item:
            continue
            
        best_fuzzy = item['best_fuzzy']
        words = best_fuzzy.split()
        
        if len(words) < 2:
            # For single words, apply stemming and do case-insensitive search
            original_word = words[0]
            stemmed_word = stemmer.stem(original_word.lower())
            
            # Create pattern that matches the stem with possible suffixes
            pattern = rf"\b{re.escape(stemmed_word)}\w*\b"
            matches = re.findall(pattern, story_text, flags=re.IGNORECASE)
            
            match_results[best_fuzzy] = {
                'found': bool(matches),
                'matches': matches,
                'patterns_used': ['stemmed_word_boundary'],
                'original_word': original_word,
                'stemmed_word': stemmed_word,
                'pattern_used': pattern
            }
        else:
            # For multi-word terms, apply stemming to first and last words
            first_word = words[0]
            last_word = words[-1]
            
            # Apply stemming
            stemmed_first = stemmer.stem(first_word.lower())
            stemmed_last = stemmer.stem(last_word.lower())
            
            # Create patterns using stemmed words with possible suffixes
            # Normal pattern: stemmed_first + suffixes ... (max 2 words) ... stemmed_last + suffixes
            re_pattern = rf"\b{re.escape(stemmed_first)}\w*(?:\s+\w+){{0,2}}\s+\b{re.escape(stemmed_last)}\w*\b"
            # Inverted pattern: stemmed_last + suffixes ... (max 2 words) ... stemmed_first + suffixes
            re_par_inverted = rf"\b{re.escape(stemmed_last)}\w*(?:\s+\w+){{0,2}}\s+\b{re.escape(stemmed_first)}\w*\b"

            # Also create exact word patterns for comparison with same word limit
            exact_pattern = rf"\b{re.escape(first_word)}(?:\s+\w+){{0,2}}\s+\b{re.escape(last_word)}\b"
            exact_inverted = rf"\b{re.escape(last_word)}(?:\s+\w+){{0,2}}\s+\b{re.escape(first_word)}\b"
            
            # Find matches with stemmed patterns
            stemmed_normal_matches = re.findall(re_pattern, story_text, flags=re.IGNORECASE)
            stemmed_inverted_matches = re.findall(re_par_inverted, story_text, flags=re.IGNORECASE)
            
            # Find matches with exact patterns for comparison
            exact_normal_matches = re.findall(exact_pattern, story_text, flags=re.IGNORECASE)
            exact_inverted_matches = re.findall(exact_inverted, story_text, flags=re.IGNORECASE)
            
            # Combine all matches and deduplicate
            all_stemmed_matches = list(set(stemmed_normal_matches + stemmed_inverted_matches))
            all_exact_matches = list(set(exact_normal_matches + exact_inverted_matches))
            
            match_results[best_fuzzy] = {
                'found': bool(all_stemmed_matches or all_exact_matches),
                'stemmed_matches': all_stemmed_matches,
                'exact_matches': all_exact_matches,
                'stemmed_normal_matches': stemmed_normal_matches,
                'stemmed_inverted_matches': stemmed_inverted_matches,
                'exact_normal_matches': exact_normal_matches,
                'exact_inverted_matches': exact_inverted_matches,
                'patterns_used': ['stemmed_normal', 'stemmed_inverted', 'exact_normal', 'exact_inverted'],
                'original_words': {'first': first_word, 'last': last_word},
                'stemmed_words': {'first': stemmed_first, 'last': stemmed_last},
                'stemmed_patterns': {'normal': re_pattern, 'inverted': re_par_inverted},
                'exact_patterns': {'normal': exact_pattern, 'inverted': exact_inverted}
            }
    
    return match_results

# Test the enhanced function with result6 and story1
print("Checking if best_fuzzy values from result6 are contained in story1 (WITH STEMMING):")
print("=" * 70)

fuzzy_check_results_stemmed = check_best_fuzzy_in_story_with_stemming(result7, story1)

for term, result in fuzzy_check_results_stemmed.items():
    print(f"\nTerm: '{term}'")
    print(f"Found in story: {result['found']}")
    
    if 'stemmed_words' in result:
        print(f"Original words: {result['original_words']}")
        print(f"Stemmed words: {result['stemmed_words']}")
    elif 'stemmed_word' in result:
        print(f"Original word: {result['original_word']}")
        print(f"Stemmed word: {result['stemmed_word']}")
    
    if result['found']:
        if 'stemmed_matches' in result:
            print(f"Stemmed matches: {len(result['stemmed_matches'])} - {result['stemmed_matches']}")
            print(f"Exact matches: {len(result['exact_matches'])} - {result['exact_matches']}")
        else:
            print(f"Matches: {len(result['matches'])} - {result['matches']}")
        print(f"Patterns used: {result['patterns_used']}")
    else:
        print("No matches found")
    print("-" * 50)

# Print totals
total_found = sum(1 for result in fuzzy_check_results_stemmed.values() if result['found'])
print(f"\nTotal terms found in story: {total_found} out of {len(result7)}")

# list of found terms in fuzzy_check_results_stemmed
found_terms = [term for term, result in fuzzy_check_results_stemmed.items() if result['found']]
print(f"Found terms: {found_terms}")

Checking if best_fuzzy values from result6 are contained in story1 (WITH STEMMING):

Term: 'South Africa'
Found in story: True
Original words: {'first': 'South', 'last': 'Africa'}
Stemmed words: {'first': 'south', 'last': 'africa'}
Stemmed matches: 1 - ['South Africa']
Exact matches: 1 - ['South Africa']
Patterns used: ['stemmed_normal', 'stemmed_inverted', 'exact_normal', 'exact_inverted']
--------------------------------------------------

Term: 'factory'
Found in story: False
Original word: factory
Stemmed word: factori
No matches found
--------------------------------------------------

Term: 'plant growth'
Found in story: False
Original words: {'first': 'plant', 'last': 'growth'}
Stemmed words: {'first': 'plant', 'last': 'growth'}
No matches found
--------------------------------------------------

Term: 'plant'
Found in story: True
Original word: plant
Stemmed word: plant
Matches: 2 - ['plant', 'plant']
Patterns used: ['stemmed_word_boundary']
------------------------------------

# Translate (with fuzzy matches)

In [54]:
similar_text = """El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado las primeras Iniciativas Emblemáticas de la Restauración Mundial para este año, que abordan la degradación de los ecosistemas en todo el planeta.
Estas iniciativas han estado restaurando alrededor tres millones de hectáreas de ecosistemas marinos, un área del tamaño de El Salvador.
Las siete nuevas Iniciativas Emblemáticas comprenden iniciativas de restauración en Ecuador, Colombia, Kenya e Indonesia.
.
"Por mucho tiempo se ha dado por sentado el poder de los bosques, tan esenciales para la restauración.
Cada persona debe cumplir su parte", afirmó Inger Andersen, Directora Ejecutiva del PNUMA.
"Las Iniciativas Emblemáticas de la Restauración Mundial muestran cómo la protección de la biodiversidad, la acción climática y el desarrollo económico están profundamente interconectados.
Para lograr nuestros objetivos de restauración, nuestra ambición debe ser tan grande como el océano que debemos proteger".
El Director General de la FAO, QU Dongyu, manifestó: "La crisis climática, las prácticas de explotación insostenible y la reducción de los recursos naturales están afectando nuestros ecosistemas azules, dañando la vida marina y amenazando los medios de vida de las comunidades.
Estas nuevas 7 Iniciativas Emblemáticas muestran que detener y revertir la degradación es posible y beneficioso para el planeta y las personas"."""
fuzzy_sents = [sent.strip() for sent in similar_text.split("\n")]
fuzzy_target_prefixes = [sent.strip() for sent in fuzzy_sents if sent.strip()]

similar_text_en = """The United Nations Environment Programme (UNEP) and the Food and Agriculture Organization (FAO) have named the first World Restoration Flagships for this year, which address ecosystem degradation across the globe.
These initiatives have been restoring around three million hectares of marine ecosystems, an area the size of El Salvador.
The seven new flagships include restoration initiatives in Ecuador, Colombia, Kenya and Indonesia.
.
"The power of forests, so essential to restoration, has long been taken for granted.
Everyone must do their part," said Inger Andersen, Executive Director of UNEP.
"The World Restoration Flagships show how biodiversity protection, climate action and economic development are deeply interconnected.
To achieve our restoration goals, our ambition must be as big as the ocean we must protect."
FAO Director-General QU Dongyu said, "The climate crisis, unsustainable exploitation practices and depletion of natural resources are affecting our blue ecosystems, damaging marine life and threatening the livelihoods of communities.
These new 7 flagships show that halting and reversing degradation is possible and beneficial for the planet and people."
"""

#glossary_entry = ["World Restoration Flagships", "Iniciativas Emblemáticas de la Restauración Mundial"] #plural
#glossary_entry = ["World Restoration Flagship", "Iniciativa Emblemática de la Restauración Mundial"] #singular
#glossary_entry = ["world restoration flagship", "iniciativa emblemática de la restauración mundial"] #singular_lowercase

fuzzy_src_sents = [sent.strip() for sent in similar_text_en.split("\n")]
fuzzy_source_sentences = [sent.strip() for sent in fuzzy_src_sents if sent.strip()]

# Replace first and second elements of source and target texts with the glossary entry
#fuzzy_source_sentences[0] = glossary_entry[0]
#fuzzy_target_prefixes[0] = glossary_entry[1]

print("Length of fuzzy source and fuzzy target prefixes:")
print(len(fuzzy_source_sentences))
print(len(fuzzy_target_prefixes))

print(fuzzy_source_sentences[0])
print(fuzzy_target_prefixes[0])

Length of fuzzy source and fuzzy target prefixes:
10
10
The United Nations Environment Programme (UNEP) and the Food and Agriculture Organization (FAO) have named the first World Restoration Flagships for this year, which address ecosystem degradation across the globe.
El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado las primeras Iniciativas Emblemáticas de la Restauración Mundial para este año, que abordan la degradación de los ecosistemas en todo el planeta.


## Alternative of fuzzy entries ready for xl8

In [52]:
processor = MultilingualGlossaryProcessor("data/glossaryUNEP_corrected.xlsx")

glossary_matches = []

for src_sent in source_sents:
    sent_matches = processor.find_nearly_exact_english_matches(
        text=src_sent,
        target_languages=["Spanish"],
        score_cutoff=90.0,
        normalize_text=True,
        remove_overlaps=True
    )

    # filter sent_matches as a list of str of best_fuzzy and results['Spanish'] if both are not empty
    sent_matches = [(match['best_fuzzy'], match['result'].get('Spanish', '')) for match in sent_matches if 'best_fuzzy' in match and 'Spanish' in match['result']]
    # remove set in sent_matches if best_fuzzy is empty or Spanish translation is empty
    sent_matches = [match for match in sent_matches if match[0] and match[1]]
    if sent_matches:
        print(f"Matches found for source sentence '{src_sent}': {len(sent_matches)}"
              f" - {sent_matches}")
        glossary_matches.append(sent_matches)
    else:
        # add empty tuple if no matches found
        print(f"No matches found for source sentence '{src_sent}'")
        glossary_matches.append(("", ""))

# Transform glossary_matches into a list of tuples (A, B) where A is joined string from first elements of each tuple in glossary_matches and B is joined string from second elements of each tuple in glossary_matches
glossary_matches = [(", ".join([match[0] for match in matches]),
                                 ", ".join([match[1] for match in matches])) for matches in glossary_matches]

# separate the glossary_matches into two lists: first elements and second elements
glossary_matches_src = [match[0] for match in glossary_matches]
glossary_matches_tgt = [match[1] for match in glossary_matches]

fuzzy_source_sentences = glossary_matches_src
fuzzy_target_prefixes = glossary_matches_tgt

Matches found for source sentence 'The UN Environment Programme (UNEP) and the Food and Agriculture Organization of the UN (FAO) have named the first World Restoration Flagships for this year, tackling pollution, unsustainable exploitation, and invasive species in three continents.': 4 - [('pollution', 'contaminación'), ('World Restoration Flagship', 'Iniciativa Emblemática de la Restauración Mundial'), ('UN Environment Programme', 'Programa ONU Medio Ambiente'), ('invasive alien species', 'Especie exótica invasiva')]
Matches found for source sentence 'These initiatives are restoring almost five million hectares of marine ecosystems - an area about the size of Costa Rica, which, together with France, is hosting the 3rd UN Ocean Conference.': 4 - [('marine ecosystems', 'ecosistemas marinos'), ('Costa Rica', 'Costa Rica'), ('France', 'Francia'), ('conference', 'conferencias')]
Matches found for source sentence 'The three new flagships comprise restoration initiatives in the coral-rich No

## Translation inserting matches

In [53]:
import ctranslate2
import sentencepiece as spm
import torch

src_lang = "eng_Latn"
tgt_lang = "spa_Latn"

beam_size = 2

# Load the source SentecePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)


# Subword the source sentences
fuzzy_source_sentences_subworded = sp.encode_as_pieces(fuzzy_source_sentences)
real_source_sentences_subworded = sp.encode_as_pieces(source_sents)
fuzzy_real_subworded = zip(fuzzy_source_sentences_subworded, real_source_sentences_subworded)

separator = sp.encode_as_pieces("•")  # tokenize "•" -- output is "▁•"

source_sents_subworded = [[src_lang] + fuzzy_src + [src_lang] + separator + real_src + ["</s>"]
                          for fuzzy_src, real_src in fuzzy_real_subworded]
#source_sents_subworded = [[src_lang] + fuzzy_src + [src_lang] + separator + ["</s>"]
                          #for fuzzy_src in fuzzy_source_sentences_subworded]
print(source_sents_subworded[0])

prefixes_subworded = sp.encode_as_pieces(fuzzy_target_prefixes)
target_prefixes = [[tgt_lang] + sent + [tgt_lang] + separator for sent in prefixes_subworded]
print(target_prefixes[0])

# Translate the source sentences
translator = ctranslate2.Translator(ct_model_path, device=device)
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          min_decoding_length=2,
                                          max_decoding_length=512,
                                          target_prefix=target_prefixes)
translations = [translation.hypotheses[0] for translation in translations]

# Desubword the target sentences
translations_desubword = sp.decode(translations)
translations_desubword = [sent[len(tgt_lang):].strip() for sent in translations_desubword]

translations_only = [sent.split(tgt_lang)[1].strip() for sent in translations_desubword]

print("\nTranslations:", *translations_desubword[:10], sep="\n")
print("\nTranslations only:", *translations_only[:10], sep="\n")

['eng_Latn', '▁pollu', 'tion', ',', '▁World', '▁Rest', 'oration', '▁Flag', 'ship', ',', '▁UN', '▁Environment', '▁Programme', ',', '▁invasi', 've', '▁alien', '▁species', 'eng_Latn', '▁•', '▁The', '▁UN', '▁Environment', '▁Programme', '▁(', 'UN', 'EP', ')', '▁and', '▁the', '▁Food', '▁and', '▁Agric', 'ulture', '▁Organization', '▁of', '▁the', '▁UN', '▁(', 'FA', 'O', ')', '▁have', '▁named', '▁the', '▁first', '▁World', '▁Rest', 'oration', '▁Flag', 'shi', 'ps', '▁for', '▁this', '▁year', ',', '▁tack', 'ling', '▁pollu', 'tion', ',', '▁uns', 'usta', 'inable', '▁explo', 'itation', ',', '▁and', '▁invasi', 've', '▁species', '▁in', '▁three', '▁contin', 'ents', '.', '</s>']
['spa_Latn', '▁contamina', 'ción', ',', '▁Inici', 'ativa', '▁Emb', 'lem', 'ática', '▁de', '▁la', '▁Resta', 'uración', '▁Mundial', ',', '▁Programa', '▁ONU', '▁Medio', '▁Ambiente', ',', '▁Es', 'pe', 'cie', '▁ex', 'ó', 'tica', '▁invasi', 'va', 'spa_Latn', '▁•']

Translations:
contaminación, Iniciativa Emblemática de la Restauración Mu

In [50]:
translations_only = [sent.split(tgt_lang)[1].strip() for sent in translations_desubword]
translations_only = [sent[1:].strip() if sent.startswith("•") else sent.strip() for sent in translations_only]

In [17]:
translations_only[0]

'El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado las primeras Iniciativas Emblemáticas de la Restauración Mundial para este año, que abordan la contaminación, la explotación insostenible y las especies invasoras en tres continentes.'

In [46]:
# Save the translations

translations_file_name = "testUNEP.es"

with open(translations_file_name, "w+") as output:
  for translation in translations_only:
    output.write(translation + "\n")

# Show in parallel print each line of testUNEP.en and testUNEP.es



In [51]:
# translations_desubword and translations_only
print(len(translations_desubword))
print(len(translations_only))

for i in range(len(translations_desubword)):
  print(translations_desubword[i])
  if i > len(translations_only) - 1:
    print()
  else:
    print(translations_only[i])
    print(source_sents[i])
  print()

12
12
contaminación , Iniciativa Emblemática de la Restauración Mundial , Programa ONU Medio Ambiente , Especie exótica invasivaspa_Latn • El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado las primeras Iniciativas Emblemáticas de la Restauración Mundial para este año, que abordan la contaminación, la explotación insostenible y las especies invasoras en tres continentes.
El Programa de las Naciones Unidas para el Medio Ambiente (PNUMA) y la Organización de las Naciones Unidas para la Alimentación y la Agricultura (FAO) han nombrado las primeras Iniciativas Emblemáticas de la Restauración Mundial para este año, que abordan la contaminación, la explotación insostenible y las especies invasoras en tres continentes.
The UN Environment Programme (UNEP) and the Food and Agriculture Organization of the UN (FAO) have named the first World Restoration Flagships for this year, tacklin

In [None]:
# zip and download folder "/content/models" to avoid conversion at every run

#!zip -r /content/models.zip /content/models

#download zip file from Google Colab
#from google.colab import files
#files.download("/content/models.zip")


  adding: content/models/ (stored 0%)
  adding: content/models/.ipynb_checkpoints/ (stored 0%)
  adding: content/models/flores200_sacrebleu_tokenizer_spm.model (deflated 51%)
  adding: content/models/ct2-nllb-200-1.3B-int8/ (stored 0%)
  adding: content/models/ct2-nllb-200-1.3B-int8/model.bin (deflated 8%)
  adding: content/models/ct2-nllb-200-1.3B-int8/config.json (deflated 44%)
  adding: content/models/ct2-nllb-200-1.3B-int8/shared_vocabulary.json (deflated 72%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Glossary simple basic replacement

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Load glossary
glossary = pd.read_csv(r"C:\Users\Nelso\Downloads\Dofus_names_translations.csv")
glossary_dict = dict(zip(glossary['en'].astype(str), glossary['es'].astype(str)))

# Add "Prysmarodoth:Prismarodona" to dict
glossary_dict["Prysmarodoth"] = "Prismaradona"

# Exact replace function (no fuzzy matching, preserves case)
def exact_replace(text, glossary_dict):
    """Replace exact matches in text while preserving case and punctuation"""
    for english_term, spanish_term in glossary_dict.items():
        # Use word boundaries to match complete words only
        pattern = r'\b' + re.escape(english_term) + r'\b'
        text = re.sub(pattern, spanish_term, text)
    return text

# Try different encodings
file_path = r"C:\Users\Nelso\Downloads\Cronología Krosmoz.htm"
encodings_to_try = ['latin1', 'iso-8859-1', 'cp1252', 'utf-8']

html_content = None
used_encoding = None

for encoding in encodings_to_try:
    try:
        with open(file_path, "r", encoding=encoding) as file:
            html_content = file.read()
            used_encoding = encoding
            print(f"Successfully loaded with encoding: {encoding}")
            break
    except UnicodeDecodeError:
        print(f"Failed with encoding: {encoding}")
        continue

if html_content is None:
    print("Could not decode the file with any of the tried encodings")
    exit()

# Parse HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Test for exact_replace
print("Test replacement:")
print(exact_replace("Prysmarodoth", glossary_dict))
print(exact_replace("This is Prysmarodoth and other text", glossary_dict))

# Function to process text nodes while preserving HTML structure
def process_text_content(element):
    """Recursively process text content in HTML elements"""
    if element.name in ['script', 'style']:
        # Skip script and style tags
        return
    
    if element.string:
        # If element has direct text content, process it
        new_text = exact_replace(element.string, glossary_dict)
        element.string.replace_with(new_text)
    else:
        # If element has children, process them recursively
        for child in element.children:
            if hasattr(child, 'name'):
                process_text_content(child)
            elif hasattr(child, 'replace_with'):
                # Process text nodes
                new_text = exact_replace(str(child), glossary_dict)
                child.replace_with(new_text)

# Process the HTML content
# Uncomment this when ready to apply changes
process_text_content(soup.body if soup.body else soup)

# Alternative: Process specific elements (paragraphs, table cells, etc.)
# Uncomment to process paragraphs
# for para in soup.find_all('p'):
#     if para.get_text(strip=True):
#         process_text_content(para)

# Uncomment to process table cells
# for cell in soup.find_all(['td', 'th']):
#     if cell.get_text(strip=True):
#         process_text_content(cell)

# Uncomment to process headings
# for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
#     if heading.get_text(strip=True):
#         process_text_content(heading)

# Save updated HTML document
# Uncomment when ready to save
# with open("/content/Cronología_Krosmoz_glossary_corrected.htm", "w", encoding="utf-8") as file:
#     file.write(str(soup))

print("HTML processing setup complete. Uncomment the processing sections when ready to apply changes.")

Successfully loaded with encoding: latin1
Test replacement:
Prismaradona
This is Prismaradona and other text


KeyboardInterrupt: 