# DOC or PDF?
Test which source format (DOCX or PDF) converts best into Markdown.

In [1]:
!pip install python-docx pypandoc pymupdf4llm html2text markdown2

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pypandoc
  Downloading pypandoc-1.15-py3-none-any.whl.metadata (16 kB)
Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting html2text
  Downloading html2text-2024.2.26.tar.gz (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting markdown2
  Downloading markdown2-2.5.3-py3-none-any.whl.metadata (2.1 kB)
Collecting pymupdf>=1.24.10 (from pymupdf4llm)
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypandoc-1.15-py3-none-any.whl (21 kB)
Downloading pymupd

In [None]:
def convert_word_to_markdown(file_path) -> str:
    """
    Extract the paragraph text of a Word (.docx) document as a Markdown string.

    Only the entries of ``doc.paragraphs`` are read; each paragraph's text is
    emitted verbatim and paragraphs are separated by one blank line.

    Args:
        file_path: Path of the .docx file, passed straight to ``docx.Document``.

    Returns:
        The paragraph texts joined with blank lines.
    """
    # Imported lazily so python-docx is only required when this converter is used.
    # (The previous unused ``import markdown`` was dropped: the notebook installs
    # ``markdown2``, so that import could fail without serving any purpose.)
    from docx import Document

    # Read the Word document
    doc = Document(file_path)

    # Join the paragraph texts into a single Markdown string
    return '\n\n'.join(para.text for para in doc.paragraphs)


def convert_pdf_to_markdown(source) -> str:
    """
    Convert a PDF to markdown text, supporting both local files and URLs.

    A source starting with ``http://`` or ``https://`` is downloaded to a
    temporary file, converted with ``pymupdf4llm.to_markdown`` and the
    temporary file is removed afterwards. Any other source is passed to
    ``pymupdf4llm.to_markdown`` directly as a local path.

    Args:
        source: Local file path or URL to a PDF document

    Returns:
        Markdown content as a string

    Raises:
        Exception: For URL sources, when the download fails, the server does
            not report a PDF content type, or the conversion fails. The
            original error is chained as ``__cause__``.
    """
    import os
    import tempfile

    import pymupdf4llm
    import requests

    if not str(source).startswith(('http://', 'https://')):
        # Process a local file
        return pymupdf4llm.to_markdown(source)

    temp_path = None
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(source, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Compare only the media type: the header may carry parameters
            # (e.g. "application/pdf; charset=binary") or different letter case.
            content_type = response.headers.get('content-type') or ''
            media_type = content_type.split(';', 1)[0].strip().lower()
            if media_type != 'application/pdf':
                raise ValueError(f"The URL did not return a PDF document: {source}")

            # delete=False: the file must outlive this ``with`` so it can be
            # reopened by path for the conversion; it is removed in ``finally``.
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
                temp_path = temp_file.name
                for chunk in response.iter_content(chunk_size=65536):
                    if chunk:
                        temp_file.write(chunk)

        # Process the temporary file
        return pymupdf4llm.to_markdown(temp_path)

    except Exception as e:
        raise Exception(f"Failed to process PDF from URL: {str(e)}") from e

    finally:
        # Clean up the temporary file, including after a failed download
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

# UNLibrary

## 1- Search for a term, filtered by document symbol

In [2]:
!pip install requests beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup

def access_un_library_by_term_and_symbol(term, document_symbol) -> str:
    """
    Access the UN Digital Library and search for documents by term and document symbol.

    Args:
        term (str): The search term to look for in the full text.
        document_symbol (str): The document symbol to filter the search results.

    Returns:
        str: The prettified HTML of the search results page if the request
        returns status 200; None for any other status or if any error occurs
        (failures are printed, not raised).
    """
    from urllib.parse import urlencode

    try:
        # Base URL
        base_url = "https://digitallibrary.un.org/search?"

        # Query fields in the order the search form uses them. They are encoded
        # by urlencode so characters such as '&', '#' or '+' inside the term or
        # the symbol cannot be mistaken for query-string syntax.
        query_fields = [
            ("ln", "en"),
            ("as", "1"),
            ("m1", "p"),
            ("p1", document_symbol),
            ("f1", "documentsymbol"),
            ("op1", "a"),
            ("m2", "p"),
            ("p2", term),
            ("f2", "fulltext"),
            ("op2", "a"),
            ("rm", ""),
            ("sf", "title"),
            ("so", "a"),
            ("rg", "50"),
            ("c", "United Nations Digital Library System"),
            ("of", "hb"),
            ("fti", "1"),
        ]
        # safe="/" keeps document symbols such as UNEP/EA readable in the URL
        url = base_url + urlencode(query_fields, safe="/")

        # Send an HTTP GET request to the URL (bounded so a stalled server
        # cannot hang the notebook)
        response = requests.get(url, timeout=30)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            print("Request was successful. Content:")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Return the HTML content
            return soup.prettify()
        else:
            print(f"Failed to retrieve the URL. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

# Example usage
#term = "plastic pollution"
#document_symbol = "UNEP/EA"
#html_output = access_un_library_by_term_and_symbol(term, document_symbol)
#if html_output:
    #print(html_output)

In [None]:
import json
import urllib.parse
import base64

def adv_search_un_library(document_symbol=None, fulltext_term=None, date_from=None, date_to=None):
    """
    Run an advanced search on the UN Digital Library and return the results page.

    The search URL is built from the arguments, printed, and then requested.

    Args:
        document_symbol: Document symbol or symbols (a string, or a list/tuple
            of strings that is joined with spaces and matched with "any-words")
        fulltext_term: Term to search in full text (phrase match)
        date_from: Start date in YYYY-MM-DD format (default "2000-01-01")
        date_to: End date in YYYY-MM-DD format (default: today's date)

    Returns:
        Prettified HTML of the search results page when the request returns
        status 200; None for any other status or if the request fails
        (failures are printed, not raised).
    """
    from datetime import date

    # Base URL
    base_url = "https://digitallibrary.un.org/search?"

    # Create the search query structure
    query = {
        "date_selector": {
            "dateType": "creation_date",
            "datePeriod": "specificdateperiod",
            "dateFrom": date_from or "2000-01-01",
            # Default to today so newly published documents are not cut off
            # by a fixed end date.
            "dateTo": date_to or date.today().isoformat()
        },
        "clauses": []
    }

    # Add document symbol search if provided
    if document_symbol:
        if isinstance(document_symbol, (list, tuple)):
            doc_symbols = " ".join(document_symbol)
        else:
            doc_symbols = document_symbol

        query["clauses"].append({
            "searchIn": "documentsymbol",
            "contain": "any-words",
            "term": doc_symbols,
            "operator": "AND"
        })

    # Add fulltext search if provided
    if fulltext_term:
        query["clauses"].append({
            "searchIn": "fulltext",
            "contain": "phrase-match",
            "term": fulltext_term,
            "operator": "AND"
        })

    # as_query is the JSON query, percent-encoded once and then base64-encoded
    query_json = json.dumps(query)
    encoded_query = urllib.parse.quote(query_json)
    encoded_query = base64.b64encode(encoded_query.encode()).decode()

    # Build parameters manually to ensure correct format
    params = [
        ("ln", "en"),
        ("as", "1"),
        ("so", "d"),
        ("rg", "50"),
        ("c", "Resource Type"),  # Note: space, not +
        ("c", "UN Bodies"),      # Separate parameter
        ("of", "hb"),
        ("fti", "1"),
        ("fti", "1"),            # Repeated parameter
        ("as_query", encoded_query),
        ("action_search", "placeholder")
    ]

    # Encode each parameter correctly
    url_parts = []
    for key, value in params:
        # as_query is appended as-is: it is already base64 text
        if key == "as_query":
            url_parts.append(f"{key}={value}")
        else:
            url_parts.append(f"{key}={urllib.parse.quote(value)}")

    url = base_url + "&".join(url_parts) + "#searchresultsbox"

    print(url)

    # Same failure contract as access_un_library_by_term_and_symbol: report the
    # problem and return None so callers can fall back to another search.
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url, timeout=30)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        print("Request was successful. Content:")

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Return the HTML content
        return soup.prettify()
    else:
        print(f"Failed to retrieve the URL. Status code: {response.status_code}")
        return None

In [None]:
# Render the search-results HTML inline (rich display, e.g. in Google Colab).
# NOTE(review): no uncommented code above this cell assigns `html_output` (the
# example usage that sets it is commented out), so on a fresh kernel this cell
# raises NameError unless a cell defining `html_output` has been run first.
from IPython.display import display, HTML

display(HTML(html_output))

0
"Home  > Search Results: ( (documentsymbol:""UNEP/EA"") AND (fulltext:""plastic pollution""))"

0,1,2
United Nations Digital Library System,30  records found,Search took 0.32 seconds.

0,1
1.0,Multiple Files  UNEP_EA.3_RES.7-AR  3/7. Marine litter and microplastics : resolution / adopted by the United Nations Environment Assembly  UNEP/EA.3/RES.7  |  2018-01-30  |  Resolutions and Decisions  |  Detailed record  -  Similar records
2.0,Multiple Files  UNEP_EA.5_RES.14-AR  5/14. End plastic pollution: towards an international legally binding instrument : resolution / adopted by the United Nations Environment Assembly  UNEP/EA.5/RES.14  |  2022-03-07  |  Resolutions and Decisions  |  Detailed record  -  Similar records
3.0,Multiple Files  UNEP_EA.4_11-AR  Analysis of voluntary commitments targeting marine litter and microplastics pursuant to resolution 3/7 : report of the Executive Director  UNEP/EA.4/11  |  2018-12-21  |  Reports  |  Detailed record  -  Similar records
4.0,"PDF  UNEP_EA.3_INF_5-EN  Combating marine plastic litter and microplastics: An assessment of the effectiveness of relevant international, regional and subregional governance s[...]  UNEP/EA.3/INF/5  |  2018-02-15  |  Documents and Publications  |  Detailed record  -  Similar records"
5.0,PDF  UNEP_EA.5_INF_4-EN  Compilation of statements and recommendations by major groups and stakeholders from North America for consideration by the United Nations Environment [...]  UNEP/EA.5/INF/4  |  2021-01-15  |  Documents and Publications  |  Detailed record  -  Similar records
6.0,"Multiple Files  UNEP_EA.5_L.22-AR  Draft resolution on a framework for addressing plastic product pollution, including single-use plastic product pollution (version of 28 January 2022) [...]  UNEP/EA.5/L.22  |  2022-01-27  |  Draft Resolutions and Decisions  |  Detailed record  -  Similar records"
7.0,Multiple Files  UNEP_EA.5_L.23-AR  Draft resolution on an international legally binding instrument on [marine] plastic pollution : draft resolution / UNEP. Environment Assembly  UNEP/EA.5/L.23  |  2022-02-16  |  Draft Resolutions and Decisions  |  Detailed record  -  Similar records
8.0,Multiple Files  UNEP_EA.5_L.6-AR  Draft resolution on an internationally legally binding instrument on plastic pollution (version submitted on 10 January 2022) : draft resolution / Per[...]  UNEP/EA.5/L.6  |  2022-01-27  |  Draft Resolutions and Decisions  |  Detailed record  -  Similar records
9.0,Multiple Files  UNEP_EA.3_L.20-AR  Draft resolution on marine litter and microplastics  UNEP/EA.3/L.20  |  2017-12-05  |  Draft Resolutions and Decisions  |  Detailed record  -  Similar records
10.0,"Multiple Files  UNEP_EA.5_3_REV.1-AR  For people and planet: the United Nations Environment Programme strategy for 2022–2025 to tackle climate change, loss of nature and pollution  UNEP/EA.5/3/REV.1  |  2021-02-17  |  Documents and Publications  |  Detailed record  -  Similar records"

Interested in being notified about new results for this query?
Set up a personal  email alert  or subscribe to the  RSS feed  .


## 2- Extract the documents from the search results

In [None]:
def extract_document_symbols(html_content) -> list:
    """
    Extract document symbols from the given HTML content.

    For every ``<div class="brief-options">`` the text node that directly
    follows the first ``<i class="fa-globe">`` icon is taken as the symbol.

    Args:
        html_content (str): The HTML content of the search results page.

    Returns:
        list: The non-empty, whitespace-stripped document symbols, in page order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    document_symbols = []

    # Find all div elements with class 'brief-options'
    for div in soup.find_all('div', class_='brief-options'):
        # Find the first <i> tag with class 'fa-globe'
        globe_icon = div.find('i', class_='fa-globe')
        if not globe_icon:
            continue

        # The node after the icon is None when the icon is the last child and
        # an element when markup follows directly; only a text node (a str
        # subclass in BeautifulSoup) can be stripped into a symbol.
        sibling = globe_icon.next_sibling
        if not isinstance(sibling, str):
            continue

        document_symbol = sibling.strip()
        if document_symbol:
            document_symbols.append(document_symbol)

    return document_symbols

#extracted_symbols = extract_document_symbols(html_output)
#print(extracted_symbols)

In [None]:
import requests
from bs4 import BeautifulSoup

def cleanSymbols(input_dict, removeDrafts=False, maxResults=3) -> list:
    """
    Clean the docSymbol of each metadata record and return at most maxResults records.

    For every kept record the symbol is reduced to the part after the last
    ' - ' (when present) and the space in ' (' and ') ' is removed. Records
    are copied before being changed, so the dictionaries in input_dict are
    left untouched. Records without a docSymbol are skipped. Processing stops
    as soon as maxResults records have been collected.

    Args:
        input_dict (list of dict): A list of dictionaries containing metadata with docSymbol strings.
        removeDrafts (bool): Whether to remove items with docType containing "draft" (case-insensitive).
        maxResults (int): Maximum number of cleaned items to return; capped at 50.
            Zero or a negative value yields an empty list.

    Returns:
        list of dict: A list of dictionaries with cleaned docSymbol strings (up to maxResults items).
    """
    cleaned_dict = []
    modified_count = 0
    spaces_count = 0
    hyphen_count = 0
    removed_count = 0

    # Normalise once so the cap and the stop condition compare the same integer
    # (maxResults may arrive as a numeric string).
    max_results = min(int(maxResults), 50)

    for item in input_dict:
        # Stop before touching another item once enough results were collected;
        # checking first also makes max_results <= 0 return nothing.
        if len(cleaned_dict) >= max_results:
            break

        # If removeDrafts is True, skip items with 'draft' in docType
        if removeDrafts and 'draft' in (item.get('docType') or '').lower():
            removed_count += 1
            continue

        original_doc_symbol = item.get('docSymbol')
        if not original_doc_symbol:
            # Nothing to clean and nothing to look the document up by
            continue

        doc_symbol = original_doc_symbol

        # If ' - ' is present, keep the last part
        if ' - ' in doc_symbol:
            doc_symbol = doc_symbol.split(' - ')[-1]
            hyphen_count += 1

        # Remove the space before '(' and after ')'
        if ' (' in doc_symbol or ') ' in doc_symbol:
            doc_symbol = doc_symbol.replace(' (', '(').replace(') ', ')')
            spaces_count += 1

        if doc_symbol != original_doc_symbol:
            modified_count += 1

        # Work on a copy so the caller's metadata is not modified in place
        cleaned_item = dict(item)
        cleaned_item['docSymbol'] = doc_symbol
        cleaned_dict.append(cleaned_item)

    print(f"Modified {modified_count} out of {len(input_dict)} symbols. Removed whitespaces from {spaces_count} and hyphens from {hyphen_count}. Removed {removed_count} items with 'draft' in docType.")
    return cleaned_dict

# Example usage: smoke-test cleanSymbols on hand-written sample metadata.
# The first two records are not drafts, so with maxResults=2 the function stops
# after them and never reaches the draft records that follow.
input_data = [
    {'docSymbol': 'UNEP/EA.2/RES.10', 'publicationDate': '2016-08-04', 'docType': 'Resolutions and Decisions', 'docTitle': '2/10. Oceans and seas : resolution / adopted by the e United Nations Environment Assembly', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.2/RES.11', 'publicationDate': '2016-08-04', 'docType': 'Resolutions and Decisions', 'docTitle': '2/11. Marine plastic litter and microplastics : resolution / adopted by the e United Nations Environment Assembly', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.2/RES.5', 'publicationDate': '2016-08-03', 'docType': 'Draft Resolutions and Decisions', 'docTitle': '2/5. Delivering on the 2030 Agenda for Sustainable  Development : resolution / adopted by the Unit', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.4/RES.5', 'publicationDate': '2016-08-03', 'docType': 'Draft Resolutions and Decisions', 'docTitle': '2/5. Delivering on the 2030 Agenda for Sustainable  Development : resolution / adopted by the Unit', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.5/RES.5', 'publicationDate': '2016-08-03', 'docType': 'Draft Resolutions and Decisions', 'docTitle': '2/5. Delivering on the 2030 Agenda for Sustainable  Development : resolution / adopted by the Unit', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.3/RES.5', 'publicationDate': '2016-08-03', 'docType': 'Draft Resolutions and Decisions', 'docTitle': '2/5. Delivering on the 2030 Agenda for Sustainable  Development : resolution / adopted by the Unit', 'isMultiple': True},
    {'docSymbol': 'UNEP/EA.6/RES.5', 'publicationDate': '2016-08-03', 'docType': 'Draft Resolutions and Decisions', 'docTitle': '2/5. Delivering on the 2030 Agenda for Sustainable  Development : resolution / adopted by the Unit', 'isMultiple': True}
]

# Keep at most two non-draft records and show the cleaned result
test_cleaned_data = cleanSymbols(input_data, removeDrafts=True, maxResults=2)
print(test_cleaned_data)

def extract_metadata_UNLib(html_content) -> list :
    """
    Extract one metadata dictionary per search result from a results page.

    Each ``<div class="result-row">`` yields a dict with these keys:
      - 'docSymbol', 'publicationDate', 'docType': the text node directly
        following the ``fa-globe``, ``fa-calendar`` and ``fa-tag`` icons;
        a key is omitted when its icon or the text after it is missing.
      - 'docTitle': text of the first link inside ``<div class="result-title">``;
        omitted when there is no such link.
      - 'isMultiple': True when the ``file-area`` div contains the text
        'Multiple Files', otherwise False (always present).

    Args:
        html_content (str): The HTML content of the search results page.

    Returns:
        list: One metadata dict per result row, in page order.
    """

    def _text_after_icon(container, icon_class):
        """Return the stripped text node right after the icon, or None if absent."""
        icon = container.find('i', class_=icon_class)
        sibling = icon.next_sibling if icon else None
        # The sibling is None when the icon is the last child and an element
        # when markup follows directly; only text nodes (str) can be stripped.
        if isinstance(sibling, str):
            return sibling.strip()
        return None

    # Metadata key -> CSS class of the icon that precedes its value
    icon_fields = (
        ('docSymbol', 'fa-globe'),
        ('publicationDate', 'fa-calendar'),
        ('docType', 'fa-tag'),
    )

    soup = BeautifulSoup(html_content, 'html.parser')
    metadata_list = []

    # Find all div elements with class 'result-row'
    for div in soup.find_all('div', class_='result-row'):
        metadata = {}

        for key, icon_class in icon_fields:
            value = _text_after_icon(div, icon_class)
            if value is not None:
                metadata[key] = value

        # Extract document title
        result_title = div.find('div', class_='result-title')
        title_link = result_title.find('a') if result_title else None
        if title_link:
            metadata['docTitle'] = title_link.text.strip()

        # Check if there are multiple files
        file_area = div.find('div', class_='file-area')
        metadata['isMultiple'] = bool(file_area and 'Multiple Files' in file_area.get_text())

        metadata_list.append(metadata)

    return metadata_list

# Example usage
#term = "plastic pollution"
#document_symbol = "UNEP/EA"
#html_output = access_un_library_by_term_and_symbol(term, document_symbol)
#if html_output:
    #metadata = extract_metadata_UNLib(html_output)
    #print(metadata)
    #metadata = cleanSymbols(metadata, removeDrafts=True)
    #print(metadata)

Modified 0 out of 7 symbols. Removed whitespaces from 0 and hyphens from 0. Removed 0 items with 'draft' in docType.
[{'docSymbol': 'UNEP/EA.2/RES.10', 'publicationDate': '2016-08-04', 'docType': 'Resolutions and Decisions', 'docTitle': '2/10. Oceans and seas : resolution / adopted by the e United Nations Environment Assembly', 'isMultiple': True}, {'docSymbol': 'UNEP/EA.2/RES.11', 'publicationDate': '2016-08-04', 'docType': 'Resolutions and Decisions', 'docTitle': '2/11. Marine plastic litter and microplastics : resolution / adopted by the e United Nations Environment Assembly', 'isMultiple': True}]


In [None]:
# Convert metadata in a Pandas Dataframe
#import pandas as pd
#UNLib_results_metadata_df = pd.DataFrame(metadata)
#UNLib_results_metadata_df

Unnamed: 0,docSymbol,publicationDate,docType,docTitle,isMultiple
0,UNEP/EA.3/RES.7,2018-01-30,Resolutions and Decisions,3/7. Marine litter and microplastics : resolut...,True
1,UNEP/EA.5/RES.14,2022-03-07,Resolutions and Decisions,5/14. End plastic pollution: towards an intern...,True
2,UNEP/EA.4/11,2018-12-21,Reports,Analysis of voluntary commitments targeting ma...,True
3,UNEP/EA.3/INF/5,2018-02-15,Documents and Publications,Combating marine plastic litter and microplast...,False
4,UNEP/EA.5/INF/4,2021-01-15,Documents and Publications,Compilation of statements and recommendations ...,False
5,UNEP/EA.5/3/REV.1,2021-02-17,Documents and Publications,For people and planet: the United Nations Envi...,True
6,UNEP/EA.5/3,2020-11-11,Documents and Publications,For people and planet: the United Nations Envi...,True
7,UNEP/EA.5/3/ADD.1,2020-12-07,Documents and Publications,For people and planet: the United Nations Envi...,True
8,UNEP/EA.4/16,2019-01-21,Reports,Implementation of paragraph 88 of the outcome ...,True
9,UNEP/EA.4/3,2018-12-21,Documents and Publications,Implementation plan “Towards a Pollution-Free ...,True


✨ Should we remove the Draft Resolutions and Decisions? Added as optional cleanSymbols(input, removeDrafts=False).

## 3- Access the landing page to check which languages the document is available in

In [8]:
def get_un_document_urls(document_symbol) -> dict :
    """
    Convert a UN document symbol into download URLs for all official UN languages

    The URLs are only built here; nothing is requested, so a returned URL does
    not guarantee that the document exists in that language.

    Args:
        document_symbol: Document symbol like 'UNEP/EA.5/HLS.1'

    Returns:
        Dictionary mapping language names to their URLs {'French': "https://..."}
    """
    from urllib.parse import quote

    # Language names and the one-letter codes used by the Lang parameter
    languages = {
        "Arabic": "A",
        "Chinese": "C",
        "English": "E",
        "French": "F",
        "Russian": "R",
        "Spanish": "S"
    }

    # Base URL format
    base_url = "https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS={}&Lang={}"

    # Percent-encode the symbol so spaces, '&' or '#' inside it cannot break
    # the query string; '/' and parentheses are left readable.
    encoded_symbol = quote(str(document_symbol), safe="/()")

    # Generate URLs for all languages
    return {
        language_name: base_url.format(encoded_symbol, language_code)
        for language_name, language_code in languages.items()
    }

# 4- Download PDF and convert to readable text

undocs.org/{lang}/{symbol}

Consider these PDF conversion options:
* LangChain's Unstructured file loader (or Llama-index) for RAG purposes: https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/
* pymupdf4llm for quality readable Markdown
* table-friendly converter https://docs.google.com/spreadsheets/d/12IhxHZbYF71dPl32PQpF_6pg9e9S8f9W4sTHt-B0KTg/edit?gid=0#gid=0 (Unstructured, gmft)
* any PDF-to-XML converter for easier bilingual alignment

Consider a cleaning function before extraction

### 4A Filter section with match

In [None]:
import re

def find_paragraphs_with_merge(text, search_string, max_paragraphs=1) -> list:
    """
    Return the paragraph(s) of a text that contain a search string.

    Paragraphs are the pieces between blank lines. When the paragraph after a
    hit looks like a page number (``**12**``) or a footnote code
    (``K1234567 123456``), the hit is treated as split across a page break:
    an optional dash separator is skipped and the following paragraph is
    appended to the hit, unless that paragraph starts a new numbered item
    (``12. ``).

    Args:
        text: The text to search in
        search_string: The string to search for (plain substring match)
        max_paragraphs: Maximum number of paragraphs to return (default: 1)

    Returns:
        With max_paragraphs == 1: the single matched paragraph as a string,
        or None when nothing matches. Otherwise: a list of matched paragraphs.
    """
    page_number_re = re.compile(r'\s*\*\*\d+\*\*\s*')
    footnote_re = re.compile(r'\s*K\d{7}\s\d{6}\s*')
    separator_re = re.compile(r'\s*-+\s*')
    numbered_start_re = re.compile(r'\s*\d+\.\s')

    blocks = text.split('\n\n')
    total = len(blocks)
    hits = []

    for idx, block in enumerate(blocks):
        if search_string not in block:
            continue

        merged = block

        # A merge needs at least two paragraphs after the hit: the page
        # number / footnote and something that can follow it.
        if idx + 2 < total:
            follower = blocks[idx + 1]
            if page_number_re.match(follower) or footnote_re.match(follower):
                # Step over a dash separator if one sits right after the marker
                resume_at = idx + 2
                if separator_re.match(blocks[resume_at]):
                    resume_at += 1

                if resume_at < total:
                    tail = blocks[resume_at]
                    # A leading "N. " marks a new numbered paragraph, not a continuation
                    if not numbered_start_re.match(tail):
                        merged = merged + " " + tail

        hits.append(merged)

        # Enough paragraphs collected
        if len(hits) >= max_paragraphs:
            break

    # Single-paragraph mode keeps the historical string-or-None return value
    if max_paragraphs == 1:
        return hits[0] if hits else None

    return hits

# Example usage
#searchText = "global decline of biodiversity"

# Find just one paragraph (original behavior)
#result = find_paragraphs_with_merge(EnglishMD, searchText)
#print(result)

# Find multiple paragraphs
#results = find_paragraphs_with_merge(EnglishMD, searchText, max_paragraphs=3)
#for i, paragraph in enumerate(results):
    #print(f"\nParagraph {i+1}:")
    #print(paragraph)

# 5- Bilingual aligner

Reuse my code from https://huggingface.co/spaces/nelsonjq/pdf2tmx/tree/main

In [10]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
# First, install required packages if not already installed
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
model_name='sentence-transformers/LaBSE'
model_name="distiluse-base-multilingual-cased-v2"

# Loaded models, keyed by model name, so repeated calls reuse the same instance
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return the SentenceTransformer for model_name, loading it only on first use."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def find_similar_paragraph_in_target(source_paragraph, target_text, model_name='distiluse-base-multilingual-cased-v2', top_k=1) -> list[tuple[str, float]]:
    """
    Find the most similar paragraph(s) in the target text using multilingual embeddings.

    The target text is split on blank lines; blank or whitespace-only pieces
    are ignored. The model is cached per model_name, so it is loaded only once
    however many times this function is called.

    Args:
        source_paragraph: The source paragraph to match
        target_text: The target text to search in
        model_name: The name of the multilingual sentence embedding model to use
        top_k: Number of matching paragraphs to return

    Returns:
        List of (paragraph, cosine similarity) tuples, best match first, with
        at most top_k entries. Empty when the target has no non-blank
        paragraph or top_k is not positive.
    """
    # Split target text into paragraphs, dropping blank ones so they are
    # neither embedded nor returned as matches
    target_paragraphs = [p for p in target_text.split('\n\n') if p.strip()]
    if not target_paragraphs or top_k <= 0:
        # Guarding top_k matters: slicing with [-0:] would return everything
        return []

    # Load (or reuse) the model
    model = _get_sentence_model(model_name)

    # Compute embeddings
    source_embedding = model.encode([source_paragraph])
    target_embeddings = model.encode(target_paragraphs)

    # Compute similarities
    similarities = cosine_similarity(source_embedding, target_embeddings)[0]

    # Indices sorted by descending similarity, truncated to the best top_k
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Plain floats match the annotated return type
    return [(target_paragraphs[i], float(similarities[i])) for i in top_indices]

# 6- Term extraction from aligned files

In [12]:
!pip install -U duckduckgo_search

Collecting duckduckgo_search
  Downloading duckduckgo_search-7.5.0-py3-none-any.whl.metadata (17 kB)
Collecting primp>=0.14.0 (from duckduckgo_search)
  Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading duckduckgo_search-7.5.0-py3-none-any.whl (20 kB)
Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: primp, duckduckgo_search
Successfully installed duckduckgo_search-7.5.0 primp-0.14.0


In [None]:
from duckduckgo_search import DDGS

def askLLM_term_equivalents(source_term, source_paragraphs, target_paragraphs, source_language, target_language, model='claude-3-haiku') -> str:
    """
    Query a LLM using DDGS().chat() to extract term equivalents across languages.

    Args:
        source_term: The specific source term to find equivalents for
        source_paragraphs: The source paragraphs (context): a string, or a list of strings
        target_paragraphs: A single paragraph string, or a list of target
            paragraphs and/or (paragraph, score) tuples as returned by
            find_similar_paragraph_in_target
        source_language: Language of the source paragraph (e.g., "English")
        target_language: Language of the target paragraphs (e.g., "Spanish")
        model: Model name passed to DDGS().chat() (default 'claude-3-haiku')

    Returns:
        String of the LLM answer with the term equivalents extracted by the LLM: <SOURCETERM>{source_language}</SOURCETERM> = <EQUIVALENTTERM>{target_language}</EQUIVALENTTERM>.
        If the query fails, a string starting with "Error extracting term equivalents:" is returned instead of raising.
    """
    # Format the source paragraphs as a single string
    source_text = "\n\n".join(source_paragraphs) if isinstance(source_paragraphs, list) else source_paragraphs

    # A bare string is one paragraph; iterating it would split it into characters
    if isinstance(target_paragraphs, str):
        target_paragraphs = [target_paragraphs]

    # Keep the paragraph text, dropping the score when an item is a (text, score) tuple
    target_texts = [item[0] if isinstance(item, tuple) else item for item in target_paragraphs]

    # Join the target paragraphs
    target_text = "\n\n".join(target_texts)

    prompt = f"""I need to extract term equivalents of {source_term} between these {source_language} and {target_language} paragraphs.
    Please identify the {target_language} equivalent terms for the {source_language} term: <SOURCETERM>{source_term}</SOURCETERM>, preserving all formatting
    (italics, capitalization, gender, and number).

    {source_language.upper()} PARAGRAPH:
    {source_text}

    {target_language.upper()} PARAGRAPH(S):
    {target_text}

    Please list the equivalents in this format:
    "<SOURCETERM>{source_language}</SOURCETERM>" = "<EQUIVALENTTERM>{target_language}</EQUIVALENTTERM>"

    Answer only with the requested term and its equivalent in a single line.
    Preserve all formatting in both languages.
    """

    try:
        # Query the LLM using DuckDuckGo's chat feature
        response = DDGS().chat(prompt, model=model)
        return response
    except Exception as e:
        return f"Error extracting term equivalents: {str(e)}"

def getEquivalents_from_response(response) -> list:
    """
    Collect the text found between <EQUIVALENTTERM> tags in an LLM response.

    Args:
        response: The LLM response containing <EQUIVALENTTERM> tags

    Returns:
        List with the content of every <EQUIVALENTTERM>...</EQUIVALENTTERM>
        pair, in order of appearance (empty when there is none)
    """
    # Non-greedy capture; DOTALL lets a term span several lines
    tag_re = re.compile(r'<EQUIVALENTTERM>(.*?)</EQUIVALENTTERM>', re.DOTALL)
    return [found.group(1) for found in tag_re.finditer(response)]

# MAIN integral function

In [None]:
## MAIN INPUTS
# Configuration read by the main script below; edit these values to run another query.


# input search text: the term searched in the documents' full text and then
# located inside each converted English document
input_search_text = "10-Year Framework of Programmes on Sustainable Consumption and Production Patterns"
# target languages to iterate over (a list of names, one name, or "ALL")
input_lang = ["Spanish", "French"] #iteration
#input_lang = "ALL"
# document-symbol filters: one symbol uses the simple search, several symbols
# are combined into a single advanced-search query
#input_filterSymbols = ["UNEP/EA", "UNEP/CBD/", "FCCC"] #to be collated in a single search query field
input_filterSymbols = ["UNEP"]
# maximum number of result documents to keep (passed to cleanSymbols as maxResults)
sourcesQuantity = 3
# maximum number of matching paragraphs per document (max_paragraphs)
paragraphsPerDoc = 2
# drop results whose docType contains "draft" (passed to cleanSymbols as removeDrafts)
eraseDrafts = True

## MAIN FUNCTION
# Search the UN Digital Library for the input term and build the cleaned list
# of result metadata (metadataCleaned) used by the processing loop below.

UNEP_LANGUAGES = {"English": "en", "French": "fr", "Spanish": "es", "Chinese": "zh", "Russian": "ru", "Arabic": "ar", "Portuguese": "pt", "Swahili": "sw"}

metadataCleaned = []

# Standardize languages: "ALL" expands to every known language and a single
# known language name becomes a one-element list
if input_lang == "ALL":
    input_lang = list(UNEP_LANGUAGES.keys())
if isinstance(input_lang, str) and input_lang in list(UNEP_LANGUAGES.keys()):
    input_lang = [input_lang]

# TODO: verify that all input languages are in UNEP_LANGUAGES

# Accept a single symbol given as a plain string, or no filter at all
if isinstance(input_filterSymbols, str):
    filter_symbols = [input_filterSymbols]
else:
    filter_symbols = list(input_filterSymbols or [])

# Always defined, so the checks below never depend on a value left over from
# an earlier run of the notebook
html_output = None

if len(filter_symbols) == 1:
    html_output = access_un_library_by_term_and_symbol(
        input_search_text,
        filter_symbols[0]
    )
elif len(filter_symbols) > 1:
    html_output = adv_search_un_library(
        document_symbol=filter_symbols,
        fulltext_term=input_search_text)

# Fall back to an unfiltered search when no filter was given or the filtered
# search returned nothing (comparison with `is None`, not an assignment)
if len(filter_symbols) == 0 or html_output is None:
    print("General term search without filters...")
    html_output = access_un_library_by_term_and_symbol(
        input_search_text,
        ""
    )

if html_output:

    metadata = extract_metadata_UNLib(html_output) #list

    print(metadata)
    if metadata:
        metadataCleaned = cleanSymbols(metadata, removeDrafts=eraseDrafts, maxResults=sourcesQuantity)
        #example [{'docSymbol': 'UNEP/EA.3/RES.7', 'publicationDate': '2018-01-30',
            #'docType': 'Resolutions and Decisions', 'docTitle': '3/7. Marine litter',
            # 'isMultiple': True}, {...]

        print(metadataCleaned)

# get drafts if no more results
  # TO DO?


# get UN Docs URL for each result docSymbol, then enrich every result in place
# with the English paragraphs containing the term, the matching paragraphs in
# each target language, and the term equivalents extracted by the LLM.
for resultItem in metadataCleaned:
    resultItem["EnglishTerm"] = input_search_text
    resultItem["docURLs"] = get_un_document_urls(resultItem["docSymbol"]) #dict
    docURLs = resultItem["docURLs"] or {}
    #print(resultItem)

    ######################################
    ### Process files
    ######################################

    ####  English file [sourceLang]
    englishURL = docURLs.get("English")
    if not englishURL:
        # Without the English source there is nothing to align against
        print(f"No English document URL for {resultItem['docSymbol']}; skipping.")
        continue

    englishMD = convert_pdf_to_markdown(englishURL)
    englishParagraphs = find_paragraphs_with_merge(englishMD,
                                                   input_search_text,
                                                   max_paragraphs=paragraphsPerDoc
                                                   )
    if not englishParagraphs:
        continue

    resultItem["EnglishParagraphs"] = englishParagraphs #list

    ### Other languages

    for targetLang in input_lang:
        # English is the source language; treating it as a target would
        # overwrite "EnglishTerm" / "EnglishParagraphs" (possible with "ALL")
        if targetLang == "English":
            continue

        targetURL = docURLs.get(targetLang)
        if not targetURL:
            print(f"No {targetLang} document URL for {resultItem['docSymbol']}; skipping.")
            continue

        langMD = convert_pdf_to_markdown(targetURL)

        # Collect the best match for EVERY English paragraph; reassigning one
        # variable inside the loop kept only the last paragraph's match.
        targetParagraphs = []
        for engPara in englishParagraphs:
            matches = find_similar_paragraph_in_target(engPara,
                                                       langMD,
                                                       model_name='distiluse-base-multilingual-cased-v2',
                                                       top_k=1)
            if not matches:
                continue
            # NOTE(review): assumes the helper returns a list of paragraphs
            # (a lone string is also accepted) -- confirm against its definition
            if isinstance(matches, str):
                targetParagraphs.append(matches)
            else:
                targetParagraphs.extend(matches)

        if not targetParagraphs:
            continue

        tParaColName = targetLang + 'Paragraphs'
        resultItem[tParaColName] = targetParagraphs #list

        #extract bilingual terms as LLM string answer
        llmAnswer = askLLM_term_equivalents(input_search_text,
                                            englishParagraphs,
                                            targetParagraphs,
                                            "English",
                                            targetLang)
        print(llmAnswer)

        targetTerms = getEquivalents_from_response(llmAnswer) #list of str
        if targetTerms:
            # Unique values in first-seen order: a set has no defined order,
            # which made the choice of the main term below arbitrary
            targetTerms = list(dict.fromkeys(targetTerms))

            # Save the targetTerm in metadata w/ its related
            targetTermColName = targetLang + 'Term'
            targetSynonymsColName = targetLang + 'Synonyms'
            resultItem[targetTermColName] = targetTerms[0]
            resultItem[targetSynonymsColName] = targetTerms[1:]




In [27]:
# Print, for every result, each translated term followed by its synonyms;
# results without any translated term are printed whole for inspection.
for item in metadataCleaned:
    # Languages for which a "<Language>Term" entry was stored. "EnglishTerm" is
    # the source term, not a translation, so it is left out. Deriving the list
    # from the keys avoids a KeyError when only some languages were extracted.
    translatedLangs = [key[:-len('Term')] for key in item
                       if key.endswith('Term') and key != 'EnglishTerm']
    if translatedLangs:
        for lang in translatedLangs:
            print(item[lang + 'Term'])
            print("\t\t"+(" ; ").join(item.get(lang + 'Synonyms', []))+"\n\n")
    else:
        print(item)

Marco Decenal de Programas sobre Modalidades de Consumo y Producción Sostenibles
		


Cadre décennal de programmation concernant les modes de consommation et de production durables
		


{'docSymbol': 'UNEP/EA.2/RES.11', 'publicationDate': '2016-08-04', 'docType': 'Resolutions and Decisions', 'docTitle': '2/11. Marine plastic litter and microplastics : resolution / adopted by the e United Nations Environment Assembly', 'isMultiple': True, 'EnglishTerm': '10-Year Framework of Programmes on Sustainable Consumption and Production Patterns', 'docURLs': {'Arabic': 'https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=UNEP/EA.2/RES.11&Lang=A', 'Chinese': 'https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=UNEP/EA.2/RES.11&Lang=C', 'English': 'https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=UNEP/EA.2/RES.11&Lang=E', 'French': 'https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=UNEP/EA.2/RES.11&Lang=F', 'Russian': 'https://daccess-ods.un.org/access.nsf/Get?OpenAgent&DS=UNEP/EA.