Libraries

In [None]:
# Libraries
import numpy as np
import sys
import os
import json
from pathlib import Path
import re
import litellm 
from pydantic import BaseModel
import pandas as pd

# add path to the dataset entities
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

Topic

In [None]:
# Sub-folder (below each topic's false-negatives input folder) the candidate-token files are read from.
potential_tokens_folder_input = "adapted"
# Sub-folder (below each topic's results folder) that receives prompts, responses and results.
potential_tokens_folder_output = "adapted"

class Config:
    """Per-topic run settings.

    Attributes:
        lang: Prompt language code for the topic (the file uses "en" and "pt").
        n: Numeric setting for the topic; this file only prints it
            (NOTE(review): confirm its meaning against the upstream step).
    """

    def __init__(self, lang, n):
        # Both values are stored unchanged as plain instance attributes.
        self.lang, self.n = lang, n

# Topic name -> Config(prompt language, n). The table order is the dict's
# insertion order, i.e. the order in which topics are iterated.
all_configs = {
    topic_name: Config(language, size)
    for topic_name, language, size in (
        ("ai", "en", 10),
        ("literature", "en", 10),
        ("music", "en", 10),
        ("politics", "en", 20),
        ("science", "en", 20),
        ("multinerd_en", "en", 20),
        ("multinerd_pt", "pt", 20),
        ("ener", "en", 20),
        ("lener", "pt", 20),
        ("neuralshift", "pt", 20),
    )
}

Prompt

In [None]:
# Prepare LLM environment
# NOTE(review): the "..." values look like redacted placeholders — supply the
# real Azure key / endpoint before running, and keep them out of version control.
os.environ.update({
    "AZURE_API_KEY": "...",
    "AZURE_API_BASE": "...",
})

# One entity as returned by the LLM; element type of LLM_Entity_List.entities.
# Documented with `#` comments rather than a docstring on purpose: pydantic may
# surface a class docstring in the JSON schema generated from the model.
class LLM_Entity(BaseModel):
  span: str  # presumably the entity text as it appears in the sentence — TODO confirm
  entity: str  # presumably the entity label assigned to that span — TODO confirm

# Structured-output schema handed to the LLM call as `response_format`.
# Documented with `#` comments rather than a docstring on purpose: pydantic may
# surface a class docstring in the JSON schema generated from the model.
class LLM_Entity_List(BaseModel):
  reflection: str  # short reflection on whether the candidate token is part of an entity
  entities: list[LLM_Entity]  # entities found; an empty list when there are none

# class LLM_Output(BaseModel):
#   candidate_token: str
#   rationale: str
#   updates: str
#   entities: list[LLM_Entity]

In [None]:
def get_prompt_prefix(topic):
    """Build the topic-specific labelling instructions inserted into the prompt.

    Reads ``entity_info/point_entities/span/{topic}/train/_point_span_4.json``
    (path relative to the working directory), a JSON object mapping each
    entity label to a list of example strings. One ``- "label" e.g. a, b``
    line is rendered per label and embedded in the topic's instruction
    template.

    Args:
        topic: Topic key, e.g. ``"ai"``, ``"multinerd_pt"`` or ``"lener"``.

    Returns:
        The instruction text for ``topic``.

    Raises:
        ValueError: If ``topic`` has no template or the entity file contains
            no labels.
        OSError: If the entity file cannot be opened.
    """

    # Entity info = point + span
    entity_file = f"entity_info/point_entities/span/{topic}/train/_point_span_4.json"
    # Context manager so the file handle is closed as soon as it is parsed.
    with open(entity_file, "r", encoding="utf-8") as f:
        point_dict = json.load(f)

    entity_info = "".join(
        f"- \"{entity}\" e.g. {', '.join(clusters)}\n"
        for entity, clusters in point_dict.items()
    )

    # Stays empty when no branch below matches, so the validation at the end
    # reports an unknown topic as ValueError instead of failing on an
    # unbound local variable.
    prompt_prefix = ""

    # Split by topic

    # AI
    if topic == "ai":
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}

Dates, times, abstract concepts, adjectives and verbs are NOT entities.
Be sure to prioritize more specific entities, such as "researcher" over "person", "conference" over "location" and "university" over "organisation", when it makes sense.
"""
    # LITERATURE
    elif topic == "literature":
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}

Dates, times, abstract concepts, adjectives and verbs are NOT entities.
Be sure to prioritize more specific entities, such as "writer" over "person", when it makes sense.
"""

    # MUSIC
    elif topic == "music":
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}

Dates, times, abstract concepts, adjectives and verbs are NOT entities.
Be sure to prioritize more specific entities, such as "musical artist" over "person" and "band" over "organisation", when it makes sense.
"""

    # POLITICS
    elif topic == "politics":
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}

Dates, times, abstract concepts, adjectives and verbs are NOT entities.
Be sure to prioritize more specific entities, such as "politician" over "person" and "political party" over "organisation", when it makes sense.
"""

    # SCIENCE
    elif topic == "science":
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}

Dates, times, abstract concepts, adjectives and verbs are NOT entities.
Abstract scientific concepts can be entities if they have a name associated with them.
Be sure to prioritize more specific entities, such as "scientist" over "person" and "university" over "organisation" or "location", when it makes sense.
"""

    # MULTINERD PT
    elif topic == "multinerd_pt":
        prompt_prefix = f"""Usa o seguinte conjunto de tipos possíveis de entidade:
{entity_info}

Datas, horas, conceitos abstratos, adjetivos e verbos NÃO são entidades.
Se uma entidade não se encaixar em nenhum dos tipos acima, não a incluas na resposta.
"""

    # MULTINERD EN + E-NER (both use the same template)
    elif topic in ("multinerd_en", "ener"):
        prompt_prefix = f"""Use the following set of possible entity labels:
{entity_info}
Dates, times, abstract concepts, adjectives and verbs are NOT entities.
"""

    # LeNER-Br + NEURALSHIFT
    elif topic in ("lener", "neuralshift"):
        prompt_prefix = f"""Usa o seguinte conjunto de tipos possíveis de entidade:
{entity_info}

Conceitos abstratos, adjetivos e verbos NÃO são entidades.
Se uma entidade não se encaixar em nenhum dos tipos acima, não a incluas na resposta.
"""

    # validation
    if entity_info == "" or prompt_prefix == "":
        raise ValueError(f"Error retrieving entity info for topic {topic}.")

    return prompt_prefix


################## FINAL PROMPT ##################

def final_prompt(topic, lang, sentence, token, positive_examples):
    """Assemble the user prompt asking the LLM to judge one candidate token.

    Args:
        topic: Topic key forwarded to ``get_prompt_prefix``.
        lang: Prompt language, ``"en"`` or ``"pt"``.
        sentence: Input sentence containing the candidate token.
        token: Candidate token to evaluate.
        positive_examples: Iterable of dicts with the keys ``"sentence"`` and
            ``"entity"``, rendered into the ``<examples>`` section.

    Returns:
        The complete prompt text in the requested language.

    Raises:
        ValueError: If ``lang`` is neither ``"en"`` nor ``"pt"``.
    """
    # One "Input / Output entity" pair per example, each followed by a blank line.
    rendered_examples = "".join(
        f"Input: {ex['sentence']}\nOutput entity: '{ex['entity']}'\n\n"
        for ex in positive_examples
    )

    if lang not in ("en", "pt"):
        raise ValueError(f"Language {lang} not supported.")

    if lang == "pt":
        return f"""
<input_text>
{sentence}
</input_text>

<candidate_token>
{token}
</candidate_token>

1. Analisa o token candidato "{token}" para determinar se deve ser considerado parte de uma nova entidade ou não.

2. {get_prompt_prefix(topic)}

3. O token candidato é um token que não foi reconhecido como parte de uma entidade pelo sistema NER. No entanto, com base nos dados de treino, apresenta uma elevada probabilidade de fazer parte de uma entidade. Nota que os dados de treino podem não ser representativos, portanto utiliza o teu melhor julgamento para determinar se o token (e o contexto envolvente) deve ser considerado parte de uma nova entidade.

4. Analisa cuidadosamente os seguintes exemplos:
<examples>
{rendered_examples}
</examples>

Formato do Output:
- reflection: pequena reflexão sobre se o token deve ser considerado parte de uma nova entidade ou não
- entities: lista de entidades, ou lista vazia se nenhuma 

Return a JSON, focusing on the token "{token}" and its surrounding context within the input text:
"{sentence}"
"""

    # Only "en" can reach this point.
    return f"""
<input_text>
{sentence}
</input_text>

<candidate_token>
{token}
</candidate_token>

1. Evaluate the candidate token "{token}" to determine if it should be categorized as an entity or not.

2. {get_prompt_prefix(topic)}

3. The candidate token is a token that was not recognized as part of an entity by the NER system. However, based on training data, it has a high likelihood of being part of an entity.
Note that the training data might not be representative, therefore use your best judgment to determine whether the token (and surroundings) should be considered as part of a new entity.

4. Consider the following examples provided carefully:
<examples>
{rendered_examples}
</examples>

Output Format:
- reflection: short reflection on whether the token should be considered part of an entity or not
- entities: list of entities, or empty list if none 

Return a JSON, focusing on the token "{token}" and its surrounding context within the input text:
"{sentence}"
"""
    
# - reflection: short reflection on whether the token should be considered an entity or not
# - reflection: breve reflexão sobre se o token deve ser considerado uma entidade ou não


LLM Function

In [None]:
# Call LLM
def safe_llm_call(prompt, system, instance):
    """Send one system + user exchange to the LLM and return the reply text.

    Args:
        prompt: Content of the user message.
        system: Content of the system message.
        instance: Only printed when the call fails, to identify the failing
            example.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        Exception: Whatever the completion call raised, re-raised unchanged
            after the diagnostics have been printed.
    """
    chat = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ]

    try:
        # stream / top_p are not passed, so the library defaults apply.
        completion = litellm.completion(
            model="azure/gpt-4o-mini",
            messages=chat,
            temperature=0.1,
            response_format=LLM_Entity_List,
        )

        # extract LLM predictions
        return completion.choices[0].message["content"]

    except Exception as err:
        print(f"\n❌❌ LLM call failed: {err}\n")
        print(f"\nExample: {instance}")
        raise

In [None]:
def system_prompt(lang):
    """Return the system message for the given prompt language.

    Args:
        lang: Prompt language, ``"en"`` or ``"pt"``.

    Returns:
        The system message text in that language.

    Raises:
        ValueError: If ``lang`` is not supported. Previously the function fell
            through and returned ``None``, which would then have been sent as
            the system message content.
    """
    if lang == "en":
        return "You are a named entity recognition (NER) system. Your task is to reflect about either the <candidate_token> is part of an entity mentioned in the <input_text> or not. Always respond with JSON with the outcome of your reflection - new entity or empty."
    if lang == "pt":
        return "És um sistema de reconhecimento de entidades (NER). A tua tarefa é refletir sobre se o <candidate_token> faz parte de uma entidade mencionada no <input_text> ou não. Responde sempre no formato JSON com o resultado da tua reflexão - nova entidade ou nada."

    # Same message format as the user-prompt builder uses for this case.
    raise ValueError(f"Language {lang} not supported.")

Call LLM for each token (reflection)

In [None]:
def process_token(topic, lang, file_name, token_info, sentence):
    """Ask the LLM whether one candidate token belongs to a missed entity.

    Builds the prompt for the token, writes it to the topic's ``prompts``
    folder, calls the LLM, appends the answer to the topic's ``responses``
    folder and returns the entities found in the answer.

    Args:
        topic: Topic key used for the prompt and for the output folders.
        lang: Prompt language (``"en"`` or ``"pt"``).
        file_name: Name used for the prompt and response files.
        token_info: Dict with the keys ``"token"`` and ``"positive_examples"``.
        sentence: Sentence containing the candidate token.

    Returns:
        The list stored under ``"entities"`` in the LLM's JSON answer (an
        empty list if the key is missing), or ``None`` when the call, the
        logging or the parsing fails; failures are printed, not raised.

    Raises:
        ValueError: If ``lang`` is not supported by ``final_prompt``.
        OSError: If the prompt file cannot be written.
    """

    token = token_info['token']
    positive_examples = token_info['positive_examples']

    prompt = final_prompt(topic, lang, sentence, token, positive_examples)

    output_dir = f"results/error_reflection/{topic}/false_negatives/{potential_tokens_folder_output}"

    # Save prompt to txt file
    # NOTE(review): opened with "w", so when several tokens share file_name
    # only the last prompt is kept, while responses below are appended.
    prompt_file_path = f"{output_dir}/prompts/prompt_{file_name}.txt"
    with open(prompt_file_path, "w", encoding="utf-8") as f:
        f.write(prompt)

    # call the LLM
    try:
        llm_response = safe_llm_call(prompt, system_prompt(lang), sentence)

        if llm_response is None:
            print(f"❌ LLM response is None for sentence: {sentence}")
            return None

        # Parse before logging: dumping the raw string would store one escaped
        # JSON string literal and make indent=4 a no-op. If the text is not
        # valid JSON, keep it verbatim so the failure can still be inspected.
        parse_failure = None
        try:
            llm_json = json.loads(llm_response)
            log_text = json.dumps(llm_json, ensure_ascii=False, indent=4)
        except (TypeError, ValueError) as e:  # JSONDecodeError is a ValueError
            parse_failure = e
            log_text = str(llm_response)

        # Save response to txt file; the trailing newline keeps the records
        # appended for successive tokens from running into each other.
        response_file_path = f"{output_dir}/responses/response_{file_name}.txt"
        with open(response_file_path, "a", encoding="utf-8") as f:
            f.write(log_text + "\n")

        if parse_failure is not None:
            raise parse_failure

        # return new entities
        return llm_json.get('entities', [])

    except Exception as e:
        # Deliberate best-effort: one failing token must not stop the run.
        print(f"❌ Error on sentence: {e}")
        return None

Evaluate all tokens in folder

In [None]:
# Run the reflection step for every configured topic: each input JSON file
# provides a sentence and its candidate tokens, and one result JSON file with
# the entities proposed by the LLM is written per input file.
for topic, config in all_configs.items():

    n = config.n
    lang = config.lang
    print(f"\n\nProcessing topic: {topic} (n={n}, lang={lang})")

    input_path = Path(f"error_reflection/false_negatives/{topic}/{potential_tokens_folder_input}")
    output_path = Path(f"results/error_reflection/{topic}/false_negatives/{potential_tokens_folder_output}")

    # ensure folders exist (parents=True also creates output_path itself)
    (output_path / "prompts").mkdir(parents=True, exist_ok=True)
    (output_path / "responses").mkdir(parents=True, exist_ok=True)

    all_data = list(input_path.glob("*.json"))
    for i, file_path in enumerate(all_data):

        print(f"\r\tProcessing instance {i+1}/{len(all_data)}", end='', flush=True)

        # load file
        with open(file_path, mode='r', encoding="utf-8") as f:
            data = json.load(f)

        sentence = data["sentence"]
        false_negative_tokens = data["false_negative_tokens"]

        new_entities = []
        for token_info in false_negative_tokens:
            token = token_info["token"]
            reflected_token_entities = process_token(topic, lang, file_path.stem, token_info, sentence)

            if not reflected_token_entities:
                continue

            # Tag each entity with the token that produced it. Doing this
            # after the loop would label every entity in the file with the
            # last candidate token.
            for entity in reflected_token_entities:
                entity["candidate_token"] = token
            new_entities.extend(reflected_token_entities)

        # save results to file
        result_json = {
            "sentence": sentence,
            "false_negative_tokens": false_negative_tokens,
            "reflected_entities": new_entities
        }

        result_file_path = output_path / f"{file_path.stem}.json"
        with open(result_file_path, "w", encoding="utf-8") as f:
            json.dump(result_json, f, ensure_ascii=False, indent=4)