Libraries

In [None]:
# Libraries
import numpy as np
import sys
import os
import json
from pathlib import Path
import re
import litellm 
from pydantic import BaseModel
import pandas as pd

# add path to the dataset entities
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

Topic

In [None]:
# Folder names: where the candidate boundary tokens are read from and where results are written
potential_tokens_folder_input = "adapted"  # input
potential_tokens_folder_output = "adapted_noCoT"  # output
results_category = "boundary"


class Config:
    """Per-topic settings: the dataset language (`lang`) and a numeric setting (`n`)."""

    def __init__(self, lang, n):
        self.lang, self.n = lang, n


# One (topic, language, n) row per dataset; dict order follows this table.
_TOPIC_SETTINGS = (
    ("ai", "en", 10),
    ("literature", "en", 10),
    ("music", "en", 10),
    ("politics", "en", 20),
    ("science", "en", 20),
    ("multinerd_en", "en", 20),
    ("multinerd_pt", "pt", 20),
    ("ener", "en", 20),
    ("lener", "pt", 20),
    ("neuralshift", "pt", 20),
)

all_configs = {
    topic_name: Config(lang_code, n_value)
    for topic_name, lang_code, n_value in _TOPIC_SETTINGS
}

Prompt

In [None]:
# Prepare LLM environment
# NOTE(review): presumably these variables are read by litellm's Azure provider - confirm.
# Keep real values out of version control.
os.environ.update({
    "AZURE_API_KEY": "...",
    "AZURE_API_BASE": "...",
})


# Structured-output schema handed to the LLM call as `response_format`.
# Deliberately no class docstring: pydantic would add it to the generated JSON schema.
class result(BaseModel):
    span: str
    entity: str

# class LLM_Entity_List(BaseModel):
#   reflection: str
#   result: LLM_Entity

# class LLM_Output(BaseModel):
#   candidate_token: str
#   rationale: str
#   updates: str
#   entities: list[LLM_Entity]

In [None]:
# def get_prompt_prefix(topic):

#     entity_info = ""

#     # Entity info = point + span
#     point_dict = json.load(open(f"entity_info/point_entities/span/{topic}/train/_point_span_4.json", "r", encoding="utf-8"))
#     for entity, clusters in point_dict.items():
#         entity_info += f"- \"{entity}\" e.g. {', '.join(clusters)}\n"

#     # Split by topic

#     # AI
#     if topic == "ai":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}

# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# Be sure to prioritize more specific entities, such as "researcher" over "person", "conference" over "location" and "university" over "organisation", when it makes sense.
# """
#     # LITERATURE
#     elif topic == "literature":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}

# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# Be sure to prioritize more specific entities, such as "writer" over "person", when it makes sense.
# """
    
#     # MUSIC
#     elif topic == "music":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}

# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# Be sure to prioritize more specific entities, such as "musical artist" over "person" and "band" over "organisation", when it makes sense.
# """

#     # POLITICS
#     elif topic == "politics":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}

# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# Be sure to prioritize more specific entities, such as "politician" over "person" and "political party" over "organisation", when it makes sense.
# """
        
#     # SCIENCE
#     elif topic == "science":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}

# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# Abstract scientific concepts can be entities if they have a name associated with them.
# Be sure to prioritize more specific entities, such as "scientist" over "person" and "university" over "organisation" or "location", when it makes sense.
# """

#     # MULTINERD PT
#     elif topic == "multinerd_pt":
#         prompt_prefix = f"""Usa o seguinte conjunto de tipos possíveis de entidade:
# {entity_info}

# Datas, horas, conceitos abstratos, adjetivos e verbos NÃO são entidades.
# Se uma entidade não se encaixar em nenhum dos tipos acima, não a incluas na resposta.
# """
        
#     # MULTINERD EN
#     elif topic == "multinerd_en":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}
# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# """
        
#     # E-NER
#     elif topic == "ener":
#         prompt_prefix = f"""Use the following set of possible entity labels:
# {entity_info}
# Dates, times, abstract concepts, adjectives and verbs are NOT entities.
# """
        
#     # LeNER-Br + NEURALSHIFT
#     elif (topic == "lener" or topic == "neuralshift"):
#         prompt_prefix = f"""Usa o seguinte conjunto de tipos possíveis de entidade:
# {entity_info}

# Conceitos abstratos, adjetivos e verbos NÃO são entidades.
# Se uma entidade não se encaixar em nenhum dos tipos acima, não a incluas na resposta.
# """
        
#     # validation
#     if entity_info == "" or prompt_prefix == "":
#         raise ValueError(f"Error retrieving entity info for topic {topic}.")

#     return prompt_prefix

In [None]:
################## FINAL PROMPT ##################
def final_prompt(lang, sentence, token, boundary, entity_str, examples):
    """Build the user prompt asking the LLM to re-check one boundary token.

    Args:
        lang: prompt language, "en" or "pt".
        sentence: the input text containing the entity.
        token: the boundary token under evaluation.
        boundary: "inside" (token currently belongs to the predicted entity)
            or "outside" (token currently sits next to the predicted entity).
        entity_str: the predicted entity text.
        examples: iterable of mappings with "sentence" and "entity" keys,
            rendered as few-shot examples inside <examples>.

    Returns:
        The fully formatted prompt string.

    Raises:
        ValueError: if `lang` or `boundary` is not one of the supported values.
    """
    # validations (language first, then boundary)
    if lang not in ("en", "pt"):
        raise ValueError(f"Language {lang} not supported.")
    if boundary not in ("inside", "outside"):
        raise ValueError(f"Boundary {boundary} not supported.")

    # example string: one "Input/Output entity" pair per example, blank-line separated
    examples_string = "".join(
        f"Input: {shot['sentence']}\nOutput entity: '{shot['entity']}'\n\n"
        for shot in examples
    )

    token_is_inside = boundary == "inside"

    ############# BOUNDARY INSIDE #############
    # The token is currently part of the entity; ask whether it should be dropped.

    if token_is_inside and lang == "en":
        return f"""
<input_text>
{sentence}
</input_text>

<predicted_entity>
{entity_str}
</predicted_entity>

<boundary_token>
{token}
</boundary_token>

<current_status>
part of the entity
</current_status>

1. Calibrate the boundary of the predicted entity by evaluating the inside boundary token "{token}".

2. The boundary token is a token that was recognized as being part of the entity by the NER system. However, based on training data, it has a high likelihood of being part of the context of an entity, but not actually inside the entity. Note that the training data might not be representative, therefore use your best judgment to determine whether the token (and surroundings) should be removed from the entity or not.

3. Consider the following (negative) examples carefully:
<examples>
{examples_string}
</examples>

4. Always output the entity. Either the same as it was given, or a modified version of it (with the new refined boundary)!

5. Always respect the <input_text> structure, classifying continuous spans.

Output Format:
- result: modified entity or original entity if no change is needed 

Return a JSON, focusing on the entity "{entity_str}" and the boundary token "{token}" within the input text:
"{sentence}"
"""

    if token_is_inside:
        # lang == "pt"
        return f"""
<input_text>
{sentence}
</input_text>

<predicted_entity>
{entity_str}
</predicted_entity>

<boundary_token>
{token}
</boundary_token>

<current_status>
dentro da entidade
</current_status>

1. Calibra a fronteira da entidade prevista avaliando o token de fronteira "{token}".

2. O token de fronteira é um token que foi reconhecido como parte da entidade pelo sistema NER. No entanto, com base nos dados de treino, ele tem uma alta probabilidade de fazer parte do contexto (à volta) de uma entidade, mas não realmente dentro da entidade. Nota que os dados de treino podem não ser representativos, portanto usa o teu melhor julgamento para determinar se o token (e arredores) deve ser removido da entidade ou não.

3. Considera cuidadosamente os seguintes exemplos (negativos):
<examples>
{examples_string}
</examples>

4. Devolve sempre a entidade. Quer seja a mesma que foi dada, ou uma versão modificada (com a nova fronteira refinada)!

5. Respeita sempre a estrutura do <input_text>, classificando spans de forma contínua.

Formato de Output:
- result: entidade modificada ou a original se nenhuma alteração for necessária

Devolve um JSON, focando-te na entidade "{entity_str}" e no token de fronteira "{token}" dentro do texto de input:
"{sentence}"
"""

    ############# BOUNDARY OUTSIDE #############
    # The token is currently outside the entity; ask whether it should be added.

    if lang == "en":
        return f"""
<input_text>
{sentence}
</input_text>

<predicted_entity>
{entity_str}
</predicted_entity>

<boundary_token>
{token}
</boundary_token>

<current_status>
outside of the entity
</current_status>

1. Calibrate the boundary of the predicted entity by evaluating the context (outside) boundary token "{token}".

2. The boundary token is a token that was not recognized as being part of the entity by the NER system. However, based on training data, it has a high likelihood of being part of the entity, but could have been left out. Note that the training data might not be representative, therefore use your best judgment to determine whether the token (and surroundings) should be added to the entity or not.

3. Consider the following (positive) examples carefully:
<examples>
{examples_string}
</examples>

4. Always output the entity. Either the same as it was given, or a modified version of it (with the new refined boundary)!

5. Always respect the <input_text> structure, classifying continuous spans.

Output Format:
- result: modified entity or original entity if no change is needed 

Return a JSON, focusing on the entity "{entity_str}" and the boundary token "{token}" within the input text:
"{sentence}"
"""

    # lang == "pt"
    return f"""
<input_text>
{sentence}
</input_text>

<predicted_entity>
{entity_str}
</predicted_entity>

<boundary_token>
{token}
</boundary_token>

<current_status>
fora da entidade
</current_status>

1. Calibra a fronteira da entidade prevista avaliando o token (externo) de fronteira "{token}".

2. O token de fronteira é um token que não foi reconhecido como parte da entidade pelo sistema NER. No entanto, com base nos dados de treino, ele tem uma alta probabilidade de fazer parte da entidade, mas pode ter sido deixado de fora. Nota que os dados de treino podem não ser representativos, portanto usa o teu melhor julgamento para determinar se o token (e arredores) deve ser adicionado à entidade ou não.

3. Considera cuidadosamente os seguintes exemplos (positivos):
<examples>
{examples_string}
</examples>

4. Devolve sempre a entidade. Quer seja a mesma que foi dada, ou uma versão modificada (com a nova fronteira refinada)!

5. Respeita sempre a estrutura do <input_text>, classificando spans de forma contínua.

Output Format:
- result: entidade modificada ou a original se nenhuma alteração for necessária

Devolve um JSON, focando-te na entidade "{entity_str}" e no token de fronteira "{token}" dentro do texto de input:
"{sentence}"
"""

LLM Function

In [None]:
# Call LLM
def safe_llm_call(prompt, system, instance):
    """Run one system + user exchange through litellm and return the reply content.

    Args:
        prompt: text sent as the user message.
        system: text sent as the system message.
        instance: only printed when the call fails, to identify the failing example.

    Returns:
        The "content" field of the first choice's message.

    Raises:
        Any exception raised while calling the model or reading the reply is
        printed and then re-raised unchanged.
    """
    chat_messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = litellm.completion(
            model="azure/gpt-4o-mini",
            messages=chat_messages,
            temperature=0.1,
            response_format=result,
            # stream = False,
            # top_p = 1,
        )

        # extract LLM predictions
        first_choice = completion.choices[0]
        return first_choice.message["content"]

    except Exception as err:
        print(f"\n❌❌ LLM call failed: {err}\n")
        print(f"\nExample: {instance}")
        raise

In [None]:
def system_prompt(lang):
    """Return the system message for the boundary-refinement task.

    Args:
        lang: prompt language, "en" or "pt".

    Returns:
        The system prompt string in the requested language.

    Raises:
        ValueError: if `lang` is not supported. Previously the function fell
            through and returned None, which would have been sent to the LLM
            as the system message content; this matches `final_prompt`.
    """
    if lang == "en":
        return "You are a named entity recognition (NER) system. Your task is to refine the boundaries of entities mentioned in the <input_text>, taking into account the <boundary_token> provided. Always output an entity with refined boundaries (or the original entity if no change is needed). Respond with JSON."
    if lang == "pt":
        return "És um sistema de reconhecimento de entidades (NER). A tua tarefa é refinar as fronteiras das entidades mencionadas no <input_text>, tendo em conta o <boundary_token> fornecido. Devolve sempre uma entidade com fronteiras refinadas (ou a entidade original se nenhuma alteração for necessária). Responde com JSON."
    raise ValueError(f"Language {lang} not supported.")

Call LLM for each token (reflection)

In [None]:
def process_token(topic, lang, file_name, token_info, sentence):
    """Ask the LLM to re-evaluate one boundary token and return the refined entity.

    The prompt and the raw LLM response are also logged to text files under
    results/error_reflection/<topic>/<results_category>/<output folder>/.

    Args:
        topic: dataset/topic name, used in the output path.
        lang: prompt language ("en" or "pt").
        file_name: stem of the input file; names the prompt/response log files.
        token_info: mapping with the keys "token", "entity_str", "boundary"
            and "examples".
        sentence: the input text the entity was predicted in.

    Returns:
        A dict with "original_entity_str" and "new_llm_entity" (the parsed JSON
        response), or None if the LLM call, the logging or the parsing failed.

    Raises:
        ValueError: propagated from `final_prompt` for an unsupported language
            or boundary (prompt construction happens outside the try block).
    """
    token = token_info['token']
    entity_str = token_info['entity_str']
    boundary = token_info['boundary']
    examples = token_info['examples']

    prompt = final_prompt(lang, sentence, token, boundary, entity_str, examples)

    output_dir = f"results/error_reflection/{topic}/{results_category}/{potential_tokens_folder_output}"

    # Do not rely on the caller having created the log folders.
    os.makedirs(f"{output_dir}/prompts", exist_ok=True)
    os.makedirs(f"{output_dir}/responses", exist_ok=True)

    # Save prompt to txt file.
    # Append instead of overwrite: all boundary tokens of one input file share
    # `file_name`, so mode "w" kept only the last token's prompt while the
    # responses file (mode "a") kept every response.
    prompt_file_path = f"{output_dir}/prompts/prompt_{file_name}.txt"
    with open(prompt_file_path, "a", encoding="utf-8") as f:
        f.write(prompt)

    # call the LLM
    try:
        llm_response = safe_llm_call(prompt, system_prompt(lang), sentence)

        if llm_response is None:
            print(f"❌ LLM response is None for sentence: {sentence}")
            return

        # Save response to txt file, one record per line so that consecutive
        # appended responses do not run into each other.
        response_file_path = f"{output_dir}/responses/response_{file_name}.txt"
        with open(response_file_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(llm_response, ensure_ascii=False, indent=4))
            f.write("\n")

        # parse json and return new entities
        new_llm_entity = json.loads(llm_response)

        return {
            "original_entity_str": entity_str,
            "new_llm_entity": new_llm_entity
        }

    except Exception as e:
        # Best effort: a failed token is reported and skipped, not fatal.
        print(f"❌ Error on sentence: {e}")
        return

Evaluate all tokens in folder

In [None]:
# For every topic: read each input JSON, run the LLM reflection on all of its
# boundary tokens and write one result JSON per input file.
for topic, config in all_configs.items():

    n = config.n
    lang = config.lang
    print(f"\n\nProcessing topic: {topic} (n={n}, lang={lang})")

    input_path = Path(f"error_reflection/{results_category}/{topic}/{potential_tokens_folder_input}")
    output_dir = f"results/error_reflection/{topic}/{results_category}/{potential_tokens_folder_output}"

    # ensure folders exist (makedirs also creates the parent output folder)
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(f"{output_dir}/prompts", exist_ok=True)
    os.makedirs(f"{output_dir}/responses", exist_ok=True)

    all_data = list(input_path.glob("*.json"))
    for i, file_path in enumerate(all_data):

        print(f"\r\tProcessing instance {i+1}/{len(all_data)}", end='', flush=True)

        # load file
        with open(file_path, mode='r', encoding="utf-8") as f:
            data = json.load(f)

        sentence = data["sentence"]
        inside_boundary_tokens = data["inside_boundary_tokens"]
        outside_boundary_tokens = data["outside_boundary_tokens"]

        new_entities = []
        for token_info in inside_boundary_tokens + outside_boundary_tokens:
            reflected_token_entities = process_token(topic, lang, file_path.stem, token_info, sentence)
            if reflected_token_entities:
                # Tag the result with the token it was produced for. Doing this
                # after the loop used the leaked loop variable, so every entity
                # of the sentence got the metadata of the LAST token.
                reflected_token_entities["boundary_token"] = token_info['token']
                reflected_token_entities["boundary"] = token_info['boundary']
                new_entities.append(reflected_token_entities)

        # save results to file
        result_json = {
            "sentence": sentence,
            "reflected_entities": new_entities
        }

        result_file_path = f"{output_dir}/{file_path.stem}.json"
        with open(result_file_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(result_json, ensure_ascii=False, indent=4))