# TERM SEEKER
Type an English term and get term candidates in the requested UN languages, using official UN documents as the only source.

## Installation

In [None]:
import shutil
import os
# Repository to clone and the name of the local folder the clone ends up in
repo_url = "https://github.com/NelsonJQ/termseeker"
folder_name = "termseeker"

# Your current working directory
#print(os.getcwd()) # if you use your local machine

# if you use Google colab: work from /content
# (comment the next line out when running on a local machine)
%cd /content        

# Remove folder if exists, so the clone below always starts from a clean copy
if os.path.exists(folder_name):
    shutil.rmtree(folder_name)

# Clone the repository
!git clone {repo_url}

# Move into the cloned repository so that `pip install .` below installs from it
%cd termseeker

# Optional: uncomment to remove a previously installed version first
#!pip3 uninstall termseeker -y

#########################################
# Install the package
#########################################

# If you use your machine and already have all the dependencies installed
# you can install the package without dependencies
#!python3 -m pip install . --no-deps

# If it is the first time you are installing the package
!python3 -m pip install .

## Playground

In [None]:
# Import the search entry point and the helper that consolidates its results
from termseeker import getCandidates, consolidate_results

#########################################
# Define the source term (only one)
input_search_text = "chemical plant"
#########################################

#########################################
# Define the target languages (one or many), only UN languages are supported
input_lang = ["Spanish", "French"]
#input_lang = ["Spanish"]
#########################################

#########################################
# Define the UN document symbols used to filter the sources (none, one or many)
input_filterSymbols = ["UNEP/EA", "S", "FCCC"]
#########################################

#########################################
# Define the maximum number of source UN documents to be used
# 50 is the limit. 2 or 3 are suggested
sourcesQuantity = 2
#sourcesQuantity = 1
#########################################

#########################################
# Define the number of paragraphs per document to search for the source term
# 1 or 2 are suggested
#paragraphsPerDoc = 2
paragraphsPerDoc = 2
#########################################


# Define if the UN draft documents should be ignored
eraseDrafts = True


# Let us start! The main function will return the result.
# `result` is reset first, presumably so that a failed call cannot leave a
# stale value from an earlier run of this cell behind.
# NOTE(review): arguments are passed positionally; the order is assumed to match
# the keyword names used in process_UNTermDF further down - confirm against getCandidates.
# NOTE(review): localLM=True here, while process_UNTermDF passes localLM=None - confirm
# which backend each value selects.
result= None
result = getCandidates(input_search_text, input_lang, input_filterSymbols, sourcesQuantity, paragraphsPerDoc, eraseDrafts, localLM=True)
print(result)


## Table visualization of results per term

In [None]:
import polars as pl

# Consolidate a copy of the raw results (exportExcel=False, so no Excel export
# is requested here), print them, and wrap them in a Polars DataFrame.
consolidated_results = consolidate_results(result.copy(), exportExcel=False)
print(consolidated_results)
consolidated_df = pl.DataFrame(consolidated_results, strict=False)

# Build a second Polars DataFrame directly from the raw (unconsolidated) results.
# It is the last expression of the cell, so it is what the notebook displays.
df = pl.DataFrame(result.copy(), strict=False)
df

In [None]:
consolidated_df

# Test on uploaded DataFrame

In [None]:
result_df.write_excel("term2-200Library.xlsx")

In [1]:
from termseeker.getcandidates import getCandidates
from termseeker.utils import consolidate_results
import polars as pl
import nltk
nltk.download('punkt')

def _is_blank(value):
    """Return True when a cell holds no usable text (None, "" or the literal string "None")."""
    return value is None or value == "" or value == "None"


def _pick_english_term(row_dict):
    """Return the English term of a row, falling back to the 'term' column.

    When the fallback is used, the value is also copied into row_dict["English"].
    Returns None when neither column holds a usable value.
    """
    english_value = row_dict.get("English")
    if english_value and english_value != "None":
        return english_value

    term_value = row_dict.get("term")
    if term_value and term_value != "None":
        # If we're using the term column, ensure it's also set in the English field
        row_dict["English"] = term_value
        return term_value

    return None


def process_UNTermDF(df, languages=None, input_filterSymbols=None,
                     sourcesQuantity=2, paragraphsPerDoc=2,
                     eraseDrafts=True, localLM=None):
    """
    Process a UNTerm dataframe by filling in missing language translations.

    For every row with a usable English term, the languages whose column exists
    but is empty (and whose "<Language>Paragraphs" column is empty or absent)
    are looked up with getCandidates; the first consolidated result is merged
    into the row. Rows without an English term are passed through untouched.
    A failure while looking up one term is printed and does not stop the run.

    Args:
        df (pl.DataFrame): DataFrame containing terminology data. It must have
            an 'English' or a 'term' column.
        languages (list[str] | None): Language columns to check. Defaults to
            French, Spanish, Russian, Chinese and Arabic.
        input_filterSymbols (list[str] | None): Document symbols forwarded to
            getCandidates. Defaults to ["UNEP", "CBD", "FCCC"].
        sourcesQuantity (int): Forwarded to getCandidates (default 2).
        paragraphsPerDoc (int): Forwarded to getCandidates (default 2).
        eraseDrafts (bool): Forwarded to getCandidates (default True).
        localLM: Forwarded to getCandidates (default None).

    Returns:
        list[dict]: One dictionary per input row, in input order. Dictionaries
        of enriched rows may carry keys that other rows do not have, so callers
        must normalise the keys before building a DataFrame.

    Raises:
        ValueError: If the dataframe has neither an 'English' nor a 'term' column.
    """
    if "English" not in df.columns and "term" not in df.columns:
        raise ValueError("Could not find English term column in the Excel file. Expected 'English' or 'term'.")

    if languages is None:
        languages = ["French", "Spanish", "Russian", "Chinese", "Arabic"]
    if input_filterSymbols is None:
        input_filterSymbols = ["UNEP", "CBD", "FCCC"]

    total_rows = len(df)
    processed_data = []

    for i, row in enumerate(df.iter_rows(named=True)):
        # Work on a copy so the enrichment never touches the caller's row object
        row_dict = dict(row)

        english_term = _pick_english_term(row_dict)

        # Skip if no English term found in either column
        if not english_term:
            processed_data.append(row_dict)
            continue

        # A language is missing only when its column exists, is blank, and no
        # paragraphs were already collected for it by a previous run.
        missing_langs = []
        for lang in languages:
            if lang not in row_dict or not _is_blank(row_dict[lang]):
                continue
            paragraph_col = f"{lang}Paragraphs"
            if not _is_blank(row_dict.get(paragraph_col)):
                print(f"  Skipping '{lang}' because '{paragraph_col}' already has content")
                continue
            missing_langs.append(lang)

        if missing_langs:
            print(f"Processing term ({i+1}/{total_rows}): '{english_term}' - Missing languages: {missing_langs}")

            # Deliberately best-effort: one failing term must not abort the batch
            try:
                results = getCandidates(
                    input_search_text=english_term,
                    input_lang=missing_langs,
                    # fresh list per call, in case the callee mutates its argument
                    input_filterSymbols=list(input_filterSymbols),
                    sourcesQuantity=sourcesQuantity,
                    paragraphsPerDoc=paragraphsPerDoc,
                    eraseDrafts=eraseDrafts,
                    localLM=localLM
                )

                if results:
                    consolidated = consolidate_results(results, exportExcel=False)

                    if consolidated:
                        # Only the first consolidated entry is merged into the row;
                        # non-None values overwrite or extend the row's fields.
                        for key, value in consolidated[0].items():
                            if value is not None:
                                row_dict[key] = value
            except Exception as e:
                print(f"Error processing term '{english_term}': {str(e)}")

        processed_data.append(row_dict)

    return processed_data

# Get polars DataFrame from Excel file
# NOTE: machine-specific absolute path; adjust it before running elsewhere.
path_file = "C:\\Users\\Nelso\\Documents\\TermSeeker\\compiled1-1900UNEAonly.xlsx"
df = pl.read_excel(path_file)

# slice df 
df = df.head(800)
#df = df.slice(1700, 200)
#df = df.tail(2)
results = process_UNTermDF(df)


# Collect every key that appears in any row, in first-seen order.
# A set would make the column order of result_df arbitrary and different
# from one run to the next; dict keys keep insertion order.
all_possible_keys = list(dict.fromkeys(key for row_dict in results for key in row_dict))

# Normalize all dictionaries to have the same keys (missing ones become None)
normalized_data = [
    {key: row_dict.get(key) for key in all_possible_keys}
    for row_dict in results
]

# Create a new DataFrame from the normalized data
result_df = pl.DataFrame(normalized_data)

result_df

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nelso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  Skipping 'French' because 'FrenchParagraphs' already has content
  Skipping 'Spanish' because 'SpanishParagraphs' already has content
  Skipping 'Russian' because 'RussianParagraphs' already has content
  Skipping 'Chinese' because 'ChineseParagraphs' already has content
  Skipping 'Arabic' because 'ArabicParagraphs' already has content
Processing term (2/800): 'absorptive capacity' - Missing languages: ['French', 'Spanish', 'Russian', 'Chinese', 'Arabic']
https://digitallibrary.un.org/search?&ln=en&as=1&so=d&rg=50&c=Resource%20Type&c=UN%20Bodies&of=hb&fti=1&fti=1&as_query=JTdCJTIyZGF0ZV9zZWxlY3RvciUyMiUzQSUyMCU3QiUyMmRhdGVUeXBlJTIyJTNBJTIwJTIyY3JlYXRpb25fZGF0ZSUyMiUyQyUyMCUyMmRhdGVQZXJpb2QlMjIlM0ElMjAlMjJzcGVjaWZpY2RhdGVwZXJpb2QlMjIlMkMlMjAlMjJkYXRlRnJvbSUyMiUzQSUyMCUyMjIwMTYtMDEtMDElMjIlMkMlMjAlMjJkYXRlVG8lMjIlM0ElMjAlMjIyMDI1LTAyLTE3JTIyJTdEJTJDJTIwJTIyY2xhdXNlcyUyMiUzQSUyMCU1QiU3QiUyMnNlYXJjaEluJTIyJTNBJTIwJTIyZG9jdW1lbnRzeW1ib2wlMjIlMkMlMjAlMjJjb250YWluJTIyJTNBJTIwJTIyYW55LXdvcm

SpanishParagraphs,English,RussianParagraphs,publicationDate,SpanishSynonyms,ChineseParagraphs,term,Arabic,docType,isUnterm,Chinese,FrenchParagraphs,error,Spanish,docTitle,RussianSynonyms,FrenchSynonyms,UNTerm_Source,ArabicParagraphs,French,ArabicSynonyms,ChineseSynonyms,EnglishParagraphs,EnglishTerm,docSymbol,Russian
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""37. **El PNUMA hará pleno uso …","""absorption""","""19. **Реформа системы развития…","""2021-02-17_x000D_ 2020-11-11""","""{'Synonyms': [], 'Similar': ['…","""(e) **促进绿色投资，实现包容和可持续的农村转型，以支持…",,,"""Documents and Publications""","""NotFound""",,"""37. **Le PNUE mettra pleinemen…",,,"""For people and planet: the Uni…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""None""","""وسيعمل برنامج األمم المتحدة لل…",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""(e) **_Catalysing green invest…","""absorption""","""UNEP/EA.5/3/REV.1_x000D_ UNEP…",
"""**D. Capacidad de absorción y …","""absorptive capacity""","""2. Ускорять разработку, демонс…","""2004-04-06 2009-11-20 2008-05-…","""{'Synonyms': [], 'Similar': ['…","""项目 33 184.67 616.39 801.06 共计 …",,,"""Documents and Publications Rep…","""NotFound""",,"""**D. Capacité d’absorption et …",,,"""Implementation of decisions 12…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': [""…","""None""","""[(][٢] [)]٢ﺍﳋﻴﺎﺭ ###### ﻣﻦ [ ﺃ…",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""D. Absorptive capacity and cou…","""absorptive capacity""","""FCCC/SBI/2004/6 FCCC/AWGLCA/20…",
,"""abuse of power""",,,"""{'Synonyms': [], 'Similar': ['…",,,"""التعسف في استعمال السلطة""",,"""UNTerm""","""滥用权力""",,,"""abuso de poder""",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': [""…","""{'source': 'UNOG', 'tags': ['I…",,"""abus de pouvoir""","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…",,,,"""злоупотрeбление властью"""
"""14. Para que los marcos de pol…","""acceptability""","""14. Для обеспечения приоритетн…","""2020-11-10""","""{'Synonyms': [], 'Similar': ['…","""14. 高级别的政治领导对于确保产品政策框架成为国家政治议程…",,,"""Reports""","""NotFound""",,"""14. Un leadership politique de…",,,"""Progress in the implementation…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': [""…","""None""",""".وتعزيز التعاون بين القطاعين ا…",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""14. High-level political leade…","""acceptability""","""UNEP/EA.5/4""",
,"""access and benefit-sharing""",,,"""{'Synonyms': ['acceso y partic…",,,"""إتاحة الموارد الوراثية وتقاسم …",,"""UNTerm""","""获取和惠益分享""",,,"""acceso a los recursos biológic…",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': ['accès aux resso…","""{'source': 'UNON', 'tags': ['U…",,"""accès [aux ressources génétiqu…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…",,,,"""доступ к генетическим ресурсам…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""**El PNUMA promoverá la integr…","""economic policy""","""**ЮНЕП будет содействовать инт…","""2021-02-17""","""{'Synonyms': [], 'Similar': ['…","""**环境署将在支持金融和经济转型时促进将环境层面融入其中。加…",,,"""Documents and Publications""",,,"""**Le PNUE encouragera l’intégr…",,,"""For people and planet: the Uni…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""None""","""وسيؤدي **واالقتصادية . برنامج …",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""**UNEP will promote integratio…","""economic policy""","""UNEP/EA.5/3/REV.1""",
,"""economic practice""",,,"""{'Synonyms': [], 'Similar': []…",,,,,,,,,,,"""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""None""",,,"""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…",,,,
,"""economic recession""",,,"""{'Synonyms': ['recesión'], 'Si…",,,"""ركود اقتصادي""",,,"""经济衰退""",,,"""recesión económica""",,"""{'Synonyms': ['рецессия'], 'Si…","""{'Synonyms': ['récession'], 'S…","""{'source': 'UNHQ', 'tags': ['E…",,"""récession économique""","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': ['衰退'], 'Similar'…",,,,"""экономическая рецессия"""
"""_Reconociendo que la recuperac…","""economic recovery""","""8. _предлагает государствам-чл…","""2022-03-07""","""{'Synonyms': [], 'Similar': ['…","""申明全球卫生治理应旨在有助于预防、发现、更好地防备和应对危机…",,,"""Resolutions and Decisions""",,,"""1. _Prie la Directrice exécuti…",,,"""5/6. Biodiversity and health :…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""None""","""،اآلمنة والفعالة وتحسين الوصول…",,"""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…","""_Acknowledging that economic r…","""economic recovery""","""UNEP/EA.5/RES.6 _x000D__x000D_…",


In [2]:
#export df to excel
result_df.write_excel("part1-340_UNEP-CBD-FCCC.xlsx")

<xlsxwriter.workbook.Workbook at 0x1695f80d0a0>

In [None]:
import polars as pl
# Load the second term workbook (machine-specific absolute path) and keep a
# 5-row window starting at row offset 1478 for inspection.
path_file = "C:\\Users\\Nelso\\Documents\\TermSeeker\\terms2-1935.xlsx"
df2 = pl.read_excel(path_file)
df2 = df2.slice(1478, 5)
#df = df.tail(2)

# Processing is disabled in this cell: the call below is commented out and the
# two loops iterate over an empty list, so both containers stay empty.
#results = process_UNTermDF(df)


# Find all possible keys across all dictionaries
all_possible_keys = set()
for row_dict in []:
    all_possible_keys.update(row_dict.keys())

# Normalize all dictionaries to have the same keys
normalized_data = []
for row_dict in []:
    normalized_dict = {key: row_dict.get(key, None) for key in all_possible_keys}
    normalized_data.append(normalized_dict)

# Create a new DataFrame from the normalized data
#result_df = pl.DataFrame(normalized_data)

#result_df.write_excel("test1600-1900UNEAonly.xlsx")
# Display the selected rows
df2

English,French,Spanish,Russian,Chinese,Arabic,UNTerm_Source,isUnterm,FrenchSynonyms,SpanishSynonyms,RussianSynonyms,ChineseSynonyms,ArabicSynonyms,term,error
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""monitoring data""",,,,,,"""None""","""NotFound""","""{'Synonyms': [], 'Similar': [""…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…",,
,,,,,,"""None""","""NotFound""","""None""","""None""","""None""","""None""","""None""","""mortgage""","""Search error: cannot access lo…"
"""motor fuel""",,,,,,"""None""","""NotFound""","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…",,
"""motor vehicle""","""véhicule à moteur""",,"""автомобиль""","""机动车辆""","""سيارة""","""{'source': 'UNOG', 'tags': ['E…","""UNTerm""","""{'Synonyms': ['véhicule automo…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': ['автотранспортно…","""{'Synonyms': [], 'Similar': ['…","""{'Synonyms': [], 'Similar': ['…",,
"""motor vehicle emission""",,,,,,"""None""","""NotFound""","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…","""{'Synonyms': [], 'Similar': []…",,


## Debugging

In [None]:
# NOTE(review): the rest of this notebook imports the package as `termseeker`;
# `termun` looks like an earlier package name - confirm it is still installed.
from termun.utils import *
from termun.convert import *

# Spanish and English PDFs used as the debugging pair
# (presumably the same UN document in both languages - verify)
ESPpdf_path = "n1903161.pdf"
ENGpdf_path = "n1903158.pdf"

# Extract text from PDF (as markdown) and print the English version
mdES = convert_pdf_to_markdown(ESPpdf_path)
mdEN = convert_pdf_to_markdown(ENGpdf_path)
print(mdEN)

In [None]:
# Look up (at most 2) English paragraphs for the probe phrase in the markdown
# produced by the previous debugging cell.
engParatest = find_paragraphs_with_merge(mdEN, "enhancing mutual trust and removing", max_paragraphs=2)
print(engParatest)
# Search the Spanish markdown for the paragraphs closest to the first English hit.
# NOTE(review): engParatest[0] assumes at least one paragraph was found - an
# empty result would raise here; confirm the helper's return type.
espParagraphs = find_similar_paragraph_in_target(engParatest[0],
                                                 mdES,
                                                 model_name='distiluse-base-multilingual-cased-v2',
                                                 top_k=2)


print("\n\n")
print(espParagraphs)

In [None]:
from termun.utils import *
from termun.convert import *
# Exercise find_paragraphs_with_merge2 on both markdown texts (mdEN / mdES come
# from the PDF conversion cell above) with one probe phrase per language.
testkw = "atmosphere begs"
engParas = find_paragraphs_with_merge2(mdEN, testkw, 2)
espParas = find_paragraphs_with_merge2(mdES, "hecho que ha sido condenado, incluso", 2)
print(espParas)
print(engParas)

Merging partial outputs

In [5]:
import polars as pl

# Partial result workbooks, listed in the order their rows should be stacked
file1 = "../compiled200-1900UNEAonly.xlsx"
file2 = "../output_tests/test1-200UNEAonly.xlsx"

file_paths = [file1, file2]

# Load every workbook into its own polars DataFrame
df_list = list(map(pl.read_excel, file_paths))

# Union of the column names found in any workbook
all_columns = set().union(*(frame.columns for frame in df_list))

# Bring every frame to the same shape: all columns present, all typed as
# strings, and laid out in alphabetical order so they can be stacked.
dfs = []
for df in df_list:
    # Existing columns are cast to strings ...
    df = df.with_columns(pl.all().cast(pl.Utf8))
    # ... and columns this workbook lacks are added as all-null string columns
    absent = [
        pl.lit(None).cast(pl.Utf8).alias(col)
        for col in all_columns
        if col not in df.columns
    ]
    if absent:
        df = df.with_columns(absent)
    dfs.append(df.select(sorted(all_columns)))

# Stack the aligned frames vertically
result_df = pl.concat(dfs)

result_df

Could not determine dtype for column 23, falling back to string


Arabic,ArabicParagraphs,ArabicSynonyms,Chinese,ChineseParagraphs,ChineseSynonyms,English,EnglishParagraphs,EnglishTerm,French,FrenchParagraphs,FrenchSynonyms,Russian,RussianParagraphs,RussianSynonyms,Spanish,SpanishParagraphs,SpanishSynonyms,UNTerm_Source,docSymbol,docTitle,docType,error,isUnterm,publicationDate,term
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""الطاقة الحيوية""",,"""{'Synonyms': [], 'Similar': ['…","""生物能""",,"""{'Synonyms': [], 'Similar': ['…","""bioenergy""",,,"""bioénergie""",,"""{'Synonyms': [], 'Similar': ['…","""биоэнергия""",,"""{'Synonyms': [], 'Similar': ['…","""bioenergía""",,"""{'Synonyms': [], 'Similar': ['…","""{'source': 'UNOG', 'tags': ['E…",,,,,,,
"""الإيثانول الأحيائي""",,"""{'Synonyms': [], 'Similar': []…","""生物乙醇""",,"""{'Synonyms': [], 'Similar': []…","""bioethanol""",,,"""bioéthanol""",,"""{'Synonyms': [], 'Similar': []…","""биоэтанол""",,"""{'Synonyms': [], 'Similar': []…","""bioetanol""",,"""{'Synonyms': [], 'Similar': []…","""{'source': 'UNOG', 'tags': ['E…",,,,,,,
"""وقود أحيائي""",,"""{'Synonyms': [], 'Similar': ['…","""生物燃料""",,"""{'Synonyms': [], 'Similar': ['…","""biofuel""",,,"""biocombustible""",,"""{'Synonyms': ['biocarburant'],…","""биотопливо""",,"""{'Synonyms': [], 'Similar': ['…","""biocombustible""",,"""{'Synonyms': ['biocarburante']…","""{'source': 'UNOG', 'tags': ['E…",,,,,,,
"""غاز حيوي""",,"""{'Synonyms': [], 'Similar': ['…","""沼气""",,"""{'Synonyms': [], 'Similar': ['…","""biogas""",,,"""biogaz""",,"""{'Synonyms': [], 'Similar': ['…","""биогаз""",,"""{'Synonyms': [], 'Similar': ['…","""biogás""",,"""{'Synonyms': [], 'Similar': ['…","""{'source': 'UNOG', 'tags': ['E…",,,,,,,
,,"""{'Synonyms': [], 'Similar': ['…",,,"""{'Synonyms': [], 'Similar': ['…","""biogeochemistry""",,,,,"""{'Synonyms': [], 'Similar': [""…",,,"""{'Synonyms': [], 'Similar': ['…",,,"""{'Synonyms': [], 'Similar': ['…","""None""",,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""تحلل أحيائي""",,"""{'Synonyms': [], 'Similar': ['…","""生物降解""",,"""{'Synonyms': [], 'Similar': ['…","""biodegradation""",,,"""biodégradation""",,"""{'Synonyms': [], 'Similar': ['…","""биодеградация""",,"""{'Synonyms': [], 'Similar': ['…","""biodegradación""",,"""{'Synonyms': [], 'Similar': ['…","""{'source': 'UNHQ', 'tags': ['S…",,,,,"""UNTerm""",,
,"""إىل برنامج األمم ادلتحدة للبيئ…","""{'Synonyms': [], 'Similar': ['…",,"""### 2/10. 海洋 联合国环境大会， 认识到海洋环境，…","""{'Synonyms': [], 'Similar': ['…","""biodiversity""","""_Gravely concerned about threa…","""biodiversity""",,"""## 2/10. Mers et océans _L’Ass…","""{'Synonyms': [], 'Similar': ['…",,"""_Ассамблея Организации Объедин…","""{'Synonyms': [], 'Similar': ['…",,"""**Asamblea de las Naciones Uni…","""{'Synonyms': [], 'Similar': ['…","""None""","""UNEP/EA.2/RES.10""","""2/10. Oceans and seas : resolu…","""Resolutions and Decisions""",,"""NotFound""","""2016-08-04""",
"""حفظ التنوع البيولوجي""",,"""{'Synonyms': [], 'Similar': ['…","""生物多样性养护""",,"""{'Synonyms': [], 'Similar': ['…","""biodiversity conservation""",,,"""préservation de la biodiversit…",,"""{'Synonyms': [], 'Similar': ['…","""сохранение биологического разн…",,"""{'Synonyms': [], 'Similar': ['…","""conservación de la diversidad …",,"""{'Synonyms': ['conservación de…","""{'source': 'UNHQ', 'tags': ['B…",,,,,"""UNTerm""",,
"""فقدان التنوع البيولوجي""",,"""{'Synonyms': [], 'Similar': ['…","""生物多样性的丧失""",,"""{'Synonyms': [], 'Similar': ['…","""biodiversity loss""",,,"""perte de biodiversité""",,"""{'Synonyms': ['diminution de l…","""утрата биоразнообразия""",,"""{'Synonyms': ['сокращение биол…","""pérdida de diversidad biológic…",,"""{'Synonyms': ['pérdida de biod…","""{'source': 'UNOG', 'tags': ['E…",,,,,"""UNTerm""",,


In [6]:
# Export df to excel
result_df.write_excel("compiled1-1900UNEAonly.xlsx")

<xlsxwriter.workbook.Workbook at 0x26593232240>

In [None]:
# print df structure, column names
print(result_df.schema)

## Multilingual LLM prompt for batch extraction

In [None]:
from openai import OpenAI
import polars as pl
import ast
import json
import os
from groq import Groq
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")

# Context keys accepted in `contexts`, mapped to the language name used for the
# "<Language>Context" entries of the prompt.
_CONTEXT_LANGUAGES = {
    "ES": "Spanish",
    "FR": "French",
    "CH": "Chinese",
    "RU": "Russian",
    "AR": "Arabic",
}

# (key the model must use in its answer, language name used in the field description)
_TARGET_SCHEMA_FIELDS = (
    ("Français", "French"),
    ("Español", "Spanish"),
    ("简体中文", "Simplified Chinese"),
    ("Русский", "Russian"),
    ("العربية", "Arabic"),
)


def _build_translation_prompt_data(sourceTerm, sourceLanguage, contexts):
    """Assemble the prompt payload: source term, expected answer schema and per-language context."""
    # The English field is pinned to the source term through a one-value enum
    term_properties = {
        "English": {
            "type": "string",
            "description": f"The original {sourceLanguage} term being translated",
            "enum": [sourceTerm]
        }
    }
    for schema_key, language_label in _TARGET_SCHEMA_FIELDS:
        term_properties[schema_key] = {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,
            "maxItems": 4,
            "description": f"List of {language_label} translations for the term '{sourceTerm}' based on the context provided"
        }

    prompt_data = {
        "sourceTerm": sourceTerm,
        "sourceLanguage": sourceLanguage,
        "outputFormat": "JSON",
        "outputStyle": {
            "type": "json_schema",
            "json_schema": {
                "name": "translations",
                "schema": {
                    "type": "object",
                    "properties": {
                        "terms": {
                            "type": "object",
                            "properties": term_properties,
                            "required": ["English", "Español", "简体中文", "Français", "Русский", "العربية"]
                        }
                    },
                    "required": ["terms"]
                }
            }
        }
    }

    # One "<Language>Context" entry per target language; languages without
    # context still get an entry with empty values.
    for lang_code, lang_name in _CONTEXT_LANGUAGES.items():
        context_info = contexts.get(lang_code, {})
        context_text = context_info.get("context", "")
        similar_text = context_info.get("similar", "")
        prompt_data[f"{lang_name}Context"] = {
            "Synonyms": context_info.get("synonyms", []),
            "documents": f"{context_text}\n\n{similar_text}".strip()
        }

    return prompt_data


def askLLM_batchextraction(sourceTerm, sourceLanguage="English", 
                          contexts=None, url='http://localhost:1234/v1'):
    """
    Ask an LLM for translations of a source term into the five other UN languages.

    The request is sent through the Groq client (model "llama-3.3-70b-versatile")
    in JSON mode; the expected answer schema is embedded in the prompt itself.

    Args:
        sourceTerm (str): The term to translate
        sourceLanguage (str): The language of the source term (default: "English")
        contexts (dict): Dictionary containing context for each target language with format:
                        {
                            "ES": {"context": str, "similar": str, "synonyms": list},
                            "FR": {"context": str, "similar": str, "synonyms": list},
                            ...
                        }
                        Recognised keys are "ES", "FR", "CH", "RU" and "AR";
                        missing languages get empty context.
        url (str): Currently unused. It is the base URL of a local OpenAI-compatible
                   endpoint from the earlier LM Studio setup and is kept so existing
                   callers keep working.

    Returns:
        dict: The parsed JSON answer of the model, or {"error": <message>} when
        the request or the JSON parsing fails (the error is also printed).
    """

    # Initialize the client
    # (local alternative: OpenAI(base_url=url, api_key="lm-studio"))
    # NOTE(review): no credentials are passed; presumably the client picks up
    # GROQ_API_KEY from the environment loaded by load_dotenv() - confirm.
    client = Groq()

    # Convert the prompt payload to a JSON string for the user message
    prompt_data = _build_translation_prompt_data(sourceTerm, sourceLanguage, contexts or {})
    prompt_json = json.dumps(prompt_data, ensure_ascii=False, indent=2)

    # JSON object mode only takes the `type` field; the schema the model should
    # follow is already part of the prompt (see "outputStyle" in prompt_data).
    response_format = {"type": "json_object"}

    try:
        final_prompt = f"Suggest a translation for <sourceterm>{sourceTerm}</sourceterm> in <targetlanguages>Russian, Spanish, Arabic, French, Simplified Chinese</targetlanguages> based on mentions of contextual documents and synonyms here below. Do not provide similar terms but the proper translation of source English string." + prompt_json
        # Create a chat completion
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": "You are a helpful multilingual assistant that understands English, French, Simplified Chinese, Arabic, Russian and Spanish. You suggest accurate translations of a single input term based on provided context."},
                {"role": "user", "content": final_prompt}
            ],
            temperature=0.05,
            response_format=response_format,
            max_completion_tokens=1230,
            stream=False,
            stop=None,
            top_p=1
        )

        # Parse the response as JSON
        response_content = completion.choices[0].message.content

        return json.loads(response_content)

    except Exception as e:
        # Best-effort by design: callers receive an error dictionary instead of an exception
        print(f"Error in LLM batch extraction: {str(e)}")
        return {"error": str(e)}


# Language code -> language name used as the column prefix in the dataframe
# (e.g. 'FrenchParagraphs', 'FrenchSynonyms'). Order is the order in which
# the per-language contexts are built.
_TERM_EXTRACTION_LANGUAGES = {
    'AR': 'Arabic',
    'CH': 'Chinese',
    'FR': 'French',
    'RU': 'Russian',
    'ES': 'Spanish',
}


def _parse_synonyms_field(synonyms_data, lang_name):
    """
    Parse one '<Language>Synonyms' cell into (synonyms_list, similar_text).

    Args:
        synonyms_data: The raw cell value: a dict, a string holding a Python
            literal or JSON object, or None / '' when there is nothing to parse.
        lang_name: Language name, used only in warning messages.

    Returns:
        Tuple (synonyms_list, similar_text). Falls back to ([], "") when the
        cell is empty or cannot be interpreted as a dict.
    """
    if synonyms_data is None or synonyms_data == "":
        return [], ""

    if isinstance(synonyms_data, str):
        # The cell may hold a Python-literal dict or a JSON object; try both.
        for parser in (ast.literal_eval, json.loads):
            try:
                synonyms_dict = parser(synonyms_data)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Only parsing failures are swallowed; KeyboardInterrupt and
                # SystemExit still propagate.
                continue
            break
        else:
            print(f"Warning: Could not parse {lang_name}Synonyms data")
            synonyms_dict = {}
    else:
        synonyms_dict = synonyms_data

    if not isinstance(synonyms_dict, dict):
        # A value that parsed to something other than a dict (list, number,
        # None, ...) has no 'Synonyms'/'Similar' keys to read.
        if isinstance(synonyms_data, str):
            print(f"Warning: Could not parse {lang_name}Synonyms data")
        return [], ""

    synonyms_list = synonyms_dict.get('Synonyms') or []
    similar_items = synonyms_dict.get('Similar')
    if similar_items is None:
        similar_text = ""
    elif isinstance(similar_items, list):
        # str() keeps the join from failing on non-string items
        similar_text = ", ".join(str(item) for item in similar_items)
    else:
        similar_text = str(similar_items)

    return synonyms_list, similar_text


def process_dataframe_for_term_extraction(df, url='http://localhost:1234/v1'):
    """
    Process a polars dataframe and extract translations for each term using askLLM_batchextraction.
    Includes all rows in the output, with None for translation_result when EnglishTerm is missing.

    For every row the 'EnglishTerm' column is the source term, and for each of
    Arabic, Chinese, French, Russian and Spanish the '<Language>Paragraphs' and
    '<Language>Synonyms' columns are turned into the context passed to the LLM.

    Args:
        df: Polars DataFrame (or LazyFrame) with language context information and English terms
        url: The base URL of the local language model API (default: 'http://localhost:1234/v1')

    Returns:
        List of row dictionaries, each with a 'translation_result' key added:
        a JSON string with the LLM answer (or with an "error" entry when the
        call raised), or None when the row has no English term.
    """
    # A LazyFrame has to be materialised before its rows can be read
    if isinstance(df, pl.LazyFrame):
        df = df.collect()

    results = []

    for idx, row in enumerate(df.to_dicts()):
        # Work on a copy so the dict returned by to_dicts() is left untouched
        row_data = row.copy()

        english_term = row.get('EnglishTerm', '')
        if english_term is None or english_term == '':
            print(f"Skipping translation for row {idx}: Missing English term")
            # Keep the row in the output, just without a translation
            row_data['translation_result'] = None
            results.append(row_data)
            continue

        print(f"Processing term: {english_term}")

        # Build the per-language context expected by askLLM_batchextraction
        contexts = {}
        for lang_code, lang_name in _TERM_EXTRACTION_LANGUAGES.items():
            context_text = row.get(f"{lang_name}Paragraphs", "")
            if context_text is None:
                context_text = ""

            synonyms_list, similar_text = _parse_synonyms_field(
                row.get(f"{lang_name}Synonyms", ""), lang_name
            )

            contexts[lang_code] = {
                "context": context_text,
                "similar": similar_text,
                "synonyms": synonyms_list
            }

        try:
            translation_result = askLLM_batchextraction(
                sourceTerm=english_term,
                sourceLanguage="English",
                contexts=contexts,
                url=url
            )
            # Stored as a JSON string so the value fits in a single dataframe cell
            row_data['translation_result'] = json.dumps(translation_result)
        except Exception as e:
            print(f"Error in term extraction for {english_term}: {e}")
            # Still keep the row, with the error recorded in place of a result
            row_data['translation_result'] = json.dumps({"error": str(e)})

        results.append(row_data)

    return results


In [None]:
# Load the compiled workbook (the path is relative to the current working directory)
result_df = pl.read_excel("../compiled200-800UNEAonly.xlsx")

# Take a test slice of up to 15 rows starting at row offset 520 and print it
test_df = result_df.slice(520, 15)
print(test_df)

In [None]:
# Run the LLM term extraction on the test slice; the result is a list of row
# dicts, each with a 'translation_result' entry (JSON string or None)

processed_results = process_dataframe_for_term_extraction(test_df)


# Kept for reference: the DataFrame is built from processed_results in the next cell
#processed_df = pl.DataFrame(processed_results)

In [None]:
# Turn the processed row dicts into a DataFrame and save it as an Excel file
processed_df = pl.DataFrame(processed_results)
processed_df.write_excel("test7LMoutput_Groq15.xlsx")

#{"terms": {"SpanishTerm": ["reconstrucci\u00f3n econ\u00f3mica", "recuperaci\u00f3n econ\u00f3mica", "programa de relance \u00e9conomique", "programa de recuperaci\u00f3n econ\u00f3mica"], "ChineseTerm": ["\u7ecf\u6d4e\u590d\u82cf\u65b9\u6848", "\u7d27\u6025\u7ecf\u6d4e\u590d\u82cf\u65b9\u6848", "\u590d\u82cf\u65b9\u6848", "\u7ecf\u6d4e\u6301\u7eed\u590d\u82cf"], "RussianTerm": ["\u043f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0430 \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0432\u043e\u0441\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u044f", "\u0446\u044e\u0440\u0435\u0437\u0432\u044b\u0447\u0430\u0439\u043d\u0430\u044f \u043f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0430 \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0432\u043e\u0441\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u044f", "\u043f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0430 \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0432\u043e\u0441\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u044f", "\u0446\u044e\u0440\u0435\u0437\u0432\u044b\u0447\u0430\u0439\u043d\u0430\u044f \u043f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0430 \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u0447\u0435\u0441\u043a\u043e\u0433\u043e \u0432\u043e\u0441\u0441\u0442\u0430\u043d\u043e\u0432\u043b\u0435\u043d\u0438\u044f"], "ArabicTerm": ["\u0628\u0631\u0646\u0627\u0645\u062c \u0627\u0644\u0625\u0646\u0639\u0627\u0634 \u0627\u0644\u0627\u0642\u062a\u0635\u0627\u062f\u064a", "\u0628\u0631\u0646\u0627\u0645\u062c \u0627\u0644\u0625\u0646\u0639\u0627\u0634 \u0627\u0644\u0627\u0642\u062a\u0635\u0627\u062f\u064a \u0644\u0644\u0637\u0648\u0627\u0631\u0626", "\u0627\u0646\u062a\u0639\u0627\u0634 \u0627\u0642\u062a\u0635\u0627\u062f\u064a \u0645\u062a\u0648\u0627\u0635\u0644", "\u0627\u0646\u062a\u0639\u0627\u0634 \u0627\u0642\u062a\u0635\u0627\u062f\u064a 
#\u0645\u064f\u0633\u062a\u062f\u064a\u0645"], "FrenchTerm": ["programme de rel\u00e8vement \u00e9conomique", "programme de relance \u00e9conomique d'urgence", "programme de r\u00e9cup\u00e9ration \u00e9conomique", "programme de rel\u00e8vement \u00e9conomique"]}}

In [None]:
def parse_translation_results(df):
    """
    Flatten the JSON stored in 'translation_result' into a long-format DataFrame.

    Each input row yields one output row per language key found under "terms",
    with the columns 'EnglishTerm', 'Language' and 'Translations'. Rows whose
    'translation_result' is None are skipped; rows that fail to parse are
    reported with a printed message and skipped.
    """
    flattened = []

    for record in df.iter_rows(named=True):
        try:
            source_term = record.get('EnglishTerm', 'Unknown')

            raw_json = record['translation_result']
            if raw_json is None:
                # Nothing was translated for this row
                continue

            payload = json.loads(raw_json)

            for language, candidates in payload.get('terms', {}).items():
                # A list of candidates is shown as one comma-separated string;
                # anything else is shown through its str() form
                joined = ", ".join(candidates) if isinstance(candidates, list) else str(candidates)
                flattened.append({
                    'EnglishTerm': source_term,
                    'Language': language,
                    'Translations': joined
                })

        except Exception as err:
            print(f"Error parsing row: {err}")

    return pl.DataFrame(flattened)

# Flatten the JSON results into one row per term and language, save them to
# Excel, and show the DataFrame as the cell output
readable_results = parse_translation_results(processed_df)
readable_results.write_excel("test15Groq_parsed.xlsx")
readable_results

# Debug helper: inspect the raw JSON of a single row (here row index 2)
#row_json = processed_df[2, 'translation_result']
#parsed_json = json.loads(row_json)

# Print in readable format
#import pprint
#pprint.pprint(parsed_json, width=100, sort_dicts=False)

### Expand df with LLM responses

In [None]:
def expand_translation_results(df):
    """
    Parse JSON results and expand them into new columns.

    For every language key under "terms" in a row's 'translation_result' JSON,
    two kinds of columns are added to that row: '<key>' with all terms joined
    by ", ", and '<key>_1', '<key>_2', ... with the individual terms.

    Args:
        df: Polars DataFrame with a 'translation_result' column holding a JSON
            string, or None for rows that were not translated.

    Returns:
        Polars DataFrame built from the expanded rows. Rows without a
        translation, and rows whose JSON cannot be processed, are kept
        unchanged.
    """
    parsed_rows = []

    for row in df.iter_rows(named=True):
        try:
            raw_result = row['translation_result']
            if raw_result is None:
                # Untranslated row: nothing to expand and not an error
                parsed_rows.append(row)
                continue

            row_dict = dict(row)
            translation = json.loads(raw_result)

            for lang, terms in translation.get('terms', {}).items():
                # Normalise to a list so that a bare string is handled as one
                # term instead of being split into characters
                if terms is None:
                    terms = []
                elif not isinstance(terms, list):
                    terms = [terms]

                # Joined form for display; str() tolerates non-string items
                row_dict[lang] = ", ".join(str(term) for term in terms)

                # Also keep individual terms if needed
                for i, term in enumerate(terms, 1):
                    row_dict[f"{lang}_{i}"] = term

            parsed_rows.append(row_dict)
        except Exception as e:
            print(f"Error parsing row: {e}")
            # Keep the original row so no input row is lost
            parsed_rows.append(row)

    # Create a new DataFrame with expanded columns
    return pl.DataFrame(parsed_rows)

# Expand the JSON results of processed_df into per-language columns
expanded_df = expand_translation_results(processed_df)

# Export to Excel, then show the DataFrame as the cell output
expanded_df.write_excel("test7LMoutput_expanded.xlsx")
expanded_df