In [None]:
from openai import OpenAI
import json
import re
from datasets import load_dataset
from dotenv import load_dotenv
import os
import random as rand

In [None]:
# Read variables from a local .env file (if any) into the process environment.
load_dotenv()

# The OpenAI client is pointed at the DeepSeek base URL; the key comes from
# the API_KEY environment variable and is None when that variable is unset.
key = os.environ.get("API_KEY")
client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=key,
)

In [None]:
# Load the "eriktks/conll2003" dataset through `datasets.load_dataset`.
dataset = load_dataset("eriktks/conll2003")

# Tag name -> integer id; each tag's position in the list below is its id.
# NOTE(review): presumably these ids match the dataset's `ner_tags` encoding — confirm.
ner_dict = {
    tag: tag_id
    for tag_id, tag in enumerate(
        ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    )
}

In [None]:
def get_ner_label(value, ner_dict):
    """Reverse lookup in ``ner_dict``: return the first key whose value equals ``value``.

    Keys are scanned in the dictionary's iteration order. Returns ``None`` when
    no entry maps to ``value``.
    """
    matching_names = (name for name, tag_id in ner_dict.items() if tag_id == value)
    return next(matching_names, None)

In [None]:
def send_prompt(prompt):
    """Send ``prompt`` as a single user message to ``deepseek-chat`` and return the reply text.

    The request is non-streaming and goes through the module-level ``client``.
    The returned value is the content of the first choice in the response.
    """
    # Only the user turn is sent; no system message is included.
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

### VANILLA METHOD

In [None]:
# Vanilla prompting: one plain instruction per sentence, no persona and no request
# for reasoning. Each reply is appended as one entry to vanilla_v3_250.txt.
# NOTE(review): this cell covers train[265:350] while the other prompting cells use
# train[100:350] — confirm the intended range before re-running from scratch.
for sentence in dataset['train']['tokens'][265:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    # The join uses single quotes: reusing the outer double quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
    Based on the given entity label set, please recognize the named entities in the given text.\n \
    Return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
    Text: \n {' '.join(sentence)}"

    answer = send_prompt(prompt)

    # Append mode: re-running the cell adds to the existing file instead of replacing it.
    # The encoding is explicit so the file does not depend on the platform default.
    with open("vanilla_v3_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")


### ROLE PLAY METHOD

In [None]:
# Role-play prompting: the instruction is preceded by an expert persona.
# Each reply is appended as one entry to role_play_250.txt.
for sentence in dataset['train']['tokens'][100:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    # The join uses single quotes: reusing the outer double quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    prompt = f"You are a linguist expert professor. \
        You have a PHD and a post-doc in name entity recognition and you have been working with this in the past 20 years.\
        You are the best in the world at this task. You are creating a new NER dataset and you are labeling some sentences tokens.\
        Based on the given entity label set: {list(ner_dict.keys())}, \
        you are going to recognize the named entities in the given sentence.\n \
        You are going to return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
        Sentence: \n {' '.join(sentence)}"

    answer = send_prompt(prompt)

    # Append mode: re-running the cell adds to the existing file instead of replacing it.
    with open("role_play_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

### CHAIN OF THOUGHT

In [None]:
# Chain-of-thought prompting: the model is asked to reason step by step and to wrap
# the final list between '$' symbols. Each reply is appended to COT_250.txt.
for sentence in dataset['train']['tokens'][100:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    # The join uses single quotes: reusing the outer double quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
        Based on the given entity label set, please recognize the named entities in the given text and show your reasoning step by step for how you identify each entity.\n \
        You must conclude your reasoning by returning a list of tuples with each token and its label. \
        At the left and right of the list insert a '$' symbol. For example $[('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]$\n \
        You must return the list with the shown format only one time and no more. \
        Text: \n {' '.join(sentence)}"

    answer = send_prompt(prompt)

    # Append mode: re-running the cell adds to the existing file instead of replacing it.
    with open("COT_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

### ROLE PLAY METHOD + CHAIN OF THOUGHT

In [None]:
# Role-play + chain-of-thought prompting: expert persona plus step-by-step reasoning,
# with the final list wrapped between '$' symbols. Each reply is appended to
# role_playand_COT_250.txt.
for sentence in dataset['train']['tokens'][100:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    # The join uses single quotes: reusing the outer double quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    prompt = f"You are a linguist expert professor. \
        You have a PHD and a post-doc in name entity recognition and you have been working with this in the past 20 years.\
        You are the best in the world at this task. You are creating a new NER dataset and you are labeling some sentences tokens.\
        Based on the given entity label set: {list(ner_dict.keys())}, \
        you are going to recognize the named entities in the given sentence and show your reasoning step by step for how you identify each entity.\n \
        You must conclude your reasoning by returning a list of tuples with each token and its label. \
        At the left and right of the list insert a '$' symbol. For example $[('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]$\n \
        You must return the list with the shown format only one time and not more. \
        The sentence is: \n {' '.join(sentence)}"

    answer = send_prompt(prompt)

    # Append mode: re-running the cell adds to the existing file instead of replacing it.
    with open("role_playand_COT_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

### Divided inputs


In [None]:
# Conversation history read and extended by send_prompt_conversation.
# It is reset to an empty list after every sentence.
messages = []

def send_prompt_conversation(prompt):
    """Run one turn of a multi-turn chat that is tracked in the global ``messages``.

    Appends ``prompt`` as a user message, sends the whole history to
    ``deepseek-chat`` through the module-level ``client``, then appends the
    reply as an assistant message.

    Returns:
        The reply text (the same string that was appended to ``messages``).
    """
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages
    )
    reply = response.choices[0].message.content
    messages.append({"role": "assistant", "content": reply})
    return reply

# Divided inputs: first ask the model to tokenize the sentence, then ask it to label
# the tokens it produced. Only the second reply is written to divided_inputs_250.txt.
for sentence in dataset['train']['tokens'][100:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    prompt = f"Take as input the sentence: \n  {' '.join(sentence)}  \n \
        --Break it down into its tokens and return them as a list. \
        The tokens will be used for a named entity recognition task.\n"

    send_prompt_conversation(prompt)

    prompt_2 = f"Given the entity label set: {list(ner_dict.keys())},\n \
                based on the tokens you found in the previous step, \
                please recognize the named entities in the given text and return a list of tuples with each token and its label. \
                Return only the list in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]  \
                Do not return any explanation or additional text.\n."

    # The reply to the second prompt is the labelled list; using the return value
    # avoids indexing into the history with a fixed position.
    labelled_answer = send_prompt_conversation(prompt_2)

    # Append mode: re-running the cell adds to the existing file instead of replacing it.
    # The encoding is explicit so the file does not depend on the platform default.
    with open("divided_inputs_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{labelled_answer}\n")

    # Start the next sentence with an empty history.
    messages = []


### Self validating

In [None]:
# Self-validation: ask for the labels, then ask the model to review and correct its
# own answer in the same conversation. Only the revised answer is written to
# self_validating_250.txt.
messages = []

for sentence in dataset['train']['tokens'][100:350]:
    # The whitespace at the start of the continuation lines is part of the prompt
    # text sent to the model, so those lines are kept exactly as they are.
    # The join uses single quotes: reusing the outer double quote inside an
    # f-string expression is a SyntaxError before Python 3.12.
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
        Based on the given entity label set, please recognize the named entities in the given text.\n \
        Return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
        Text: \n {' '.join(sentence)}"

    send_prompt_conversation(prompt)

    prompt_2 = f"Please carefully review your previous answer and correct any possible mistakes. \
    Keep the same output structure: a list of tuples in this format: \
    [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...] \
    Do not return any explanation or additional text.\n."
    send_prompt_conversation(prompt_2)

    # After the two exchanges the history holds four messages; the last one is the
    # assistant's revised answer. Single quotes inside the f-string keep this line
    # valid on Python versions older than 3.12.
    with open("self_validating_250.txt", "a", encoding="utf-8") as file:
        file.write(f"{messages[-1]['content']}\n")

    # Start the next sentence with an empty history.
    messages = []

### EVALUATION

In [None]:
# Evaluation of results
import ast
from sklearn.metrics import precision_score, recall_score, f1_score

def parse_prediction(pred_str, ner_dict):
    """Turn a stringified list of ``(token, label)`` pairs into two parallel lists.

    ``pred_str`` is evaluated with ``ast.literal_eval``; every element of the
    result must unpack into exactly two items.

    Returns:
        ``(tokens, label_ids)`` where each label name is looked up in
        ``ner_dict``; a label that is not a key of ``ner_dict`` becomes ``0``.
    """
    pairs = ast.literal_eval(pred_str)
    # Decode in a single pass, then split into the two output lists.
    decoded = [(token, ner_dict.get(label, 0)) for token, label in pairs]
    tokens = [token for token, _ in decoded]
    label_ids = [label_id for _, label_id in decoded]
    return tokens, label_ids
        
def _align_labels(pred_tokens, pred_labels, true_tokens, true_labels):
    """Return one predicted label id per gold token, using -1 where none lines up.

    When the prediction has as many labels as the gold sentence, the predicted
    labels are used positionally and the tokens are not compared. Otherwise the
    gold tokens are walked in order: a gold token equal to the prediction at the
    cursor takes that label and advances the cursor; any other gold token gets -1
    and the cursor stays where it is.
    """
    if len(pred_labels) == len(true_labels):
        return list(pred_labels)

    aligned = []
    cursor = 0
    for gold_token in true_tokens:
        if cursor < len(pred_tokens) and pred_tokens[cursor] == gold_token:
            aligned.append(pred_labels[cursor])
            cursor += 1
        else:
            aligned.append(-1)
    return aligned


def _score_prediction_file(file_path, dataset, label_map, pattern, start=0):
    """Shared scorer: extract predictions matching ``pattern`` from a file and score them.

    The i-th match in the file is compared with sentence ``start + i`` of
    ``dataset['train']``. Prints the flattened label sequences and the
    micro-averaged precision, recall and F1, and returns the F1.
    """
    # Read as UTF-8 so the result does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # NOTE: the patterns are non-greedy, so a ']' inside a token ends the match early.
    predictions = re.findall(pattern, content, re.DOTALL)

    # Fetch each needed column slice once instead of indexing the full column
    # again for every prediction.
    stop = start + len(predictions)
    train_split = dataset['train']
    gold_tokens = train_split['tokens'][start:stop]
    y_true = train_split['ner_tags'][start:stop]
    if len(y_true) < len(predictions):
        raise IndexError(
            f"{file_path}: found {len(predictions)} predictions but only "
            f"{len(y_true)} train sentences are available from index {start}"
        )

    y_pred = []
    for pred, true_tokens, true_labels in zip(predictions, gold_tokens, y_true):
        # Use the mapping passed by the caller rather than the global ner_dict.
        pred_tokens, pred_labels = parse_prediction(pred.strip(), label_map)
        y_pred.append(_align_labels(pred_tokens, pred_labels, true_tokens, true_labels))

    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    print(f"Flat true: {flat_true}")
    print(f"Flat pred: {flat_pred}")

    # Score only the ids present in the mapping; the -1 placeholder is not among
    # them, so an unaligned token is never counted as a predicted label.
    metric_kwargs = {
        "labels": sorted(set(label_map.values())),
        "average": "micro",
        "zero_division": 0,
    }
    precision = precision_score(flat_true, flat_pred, **metric_kwargs)
    recall = recall_score(flat_true, flat_pred, **metric_kwargs)
    f1 = f1_score(flat_true, flat_pred, **metric_kwargs)

    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    return f1


def f1Score(file_path, dataset, dict=ner_dict, start=0):
    """
    Calculate the F1 score for the predictions in the file.

    Every ``[...]`` span found in the file is treated as one prediction, in order.

    Args:
        file_path: text file containing the model outputs.
        dataset: object whose ``['train']`` split exposes ``'tokens'`` and ``'ner_tags'``.
        dict: label name -> label id mapping (the name shadows the builtin and is
            kept only so existing keyword calls keep working).
        start: index of the train sentence that the first prediction refers to.
            Defaults to 0, which is the previous fixed behaviour.

    Returns:
        The micro-averaged F1 over the label ids in ``dict``.

    Raises:
        IndexError: if the file holds more predictions than there are train
            sentences from ``start`` onwards.
    """
    return _score_prediction_file(file_path, dataset, dict, r'(\[.*?\])', start)


def f1Score_dollar(file_path, dataset, dict=ner_dict, start=0):
    """
    Calculate the F1 score for the predictions in the file,
    extracting only the string between two '$' symbols.

    Only ``$[...]$`` spans are treated as predictions, in order.

    Args:
        file_path: text file containing the model outputs.
        dataset: object whose ``['train']`` split exposes ``'tokens'`` and ``'ner_tags'``.
        dict: label name -> label id mapping (the name shadows the builtin and is
            kept only so existing keyword calls keep working).
        start: index of the train sentence that the first prediction refers to.
            Defaults to 0, which is the previous fixed behaviour.

    Returns:
        The micro-averaged F1 over the label ids in ``dict``.

    Raises:
        IndexError: if the file holds more predictions than there are train
            sentences from ``start`` onwards.
    """
    return _score_prediction_file(file_path, dataset, dict, r'\$(\[.*?\])\$', start)


In [None]:
# Score the saved prediction files. Each call prints the flattened labels plus
# precision/recall/F1 and returns the F1. The commented-out calls are the other
# experiments; the `_dollar` variant is for outputs wrapped in '$' symbols.
# NOTE(review): the prompting cells write files named "*_250.txt", while the names
# used here have no "_250" suffix — confirm which files are meant to be scored.
# f1Score("role_play.txt", dataset, ner_dict)
f1Score("vanilla_v3.txt", dataset, ner_dict)
# f1Score_dollar("COT.txt", dataset, ner_dict)
# f1Score_dollar("role_playand_COT.txt", dataset, ner_dict)
#f1Score("divided_inputs.txt", dataset, ner_dict)
f1Score("self_validating.txt", dataset, ner_dict)