In [47]:
from openai import OpenAI
import json
import re
from datasets import load_dataset
from dotenv import load_dotenv
import os
import random as rand

In [48]:
load_dotenv()

key = os.getenv("API_KEY")
client = OpenAI(api_key=key, base_url="https://api.deepseek.com")

In [49]:
dataset = load_dataset("ontonotes/conll2012_ontonotesv5", 'english_v4', trust_remote_code=True)
ner_dict = {
    "O": 0,
    "B-PERSON": 1,
    "I-PERSON": 2,
    "B-NORP": 3,
    "I-NORP": 4,
    "B-FAC": 5,
    "I-FAC": 6,
    "B-ORG": 7,
    "I-ORG": 8,
    "B-GPE": 9,
    "I-GPE": 10,
    "B-LOC": 11,
    "I-LOC": 12,
    "B-PRODUCT": 13,
    "I-PRODUCT": 14,
    "B-DATE": 15,
    "I-DATE": 16,
    "B-TIME": 17,
    "I-TIME": 18,
    "B-PERCENT": 19,
    "I-PERCENT": 20,
    "B-MONEY": 21,
    "I-MONEY": 22,
    "B-QUANTITY": 23,
    "I-QUANTITY": 24,
    "B-ORDINAL": 25,
    "I-ORDINAL": 26,
    "B-CARDINAL": 27,
    "I-CARDINAL": 28,
    "B-EVENT": 29,
    "I-EVENT": 30,
    "B-WORK_OF_ART": 31,
    "I-WORK_OF_ART": 32,
    "B-LAW": 33,
    "I-LAW": 34,
    "B-LANGUAGE": 35,
    "I-LANGUAGE": 36
}


In [50]:
print(dataset["train"][0]["sentences"][1]["words"])
print(dataset["train"][0]["sentences"][1]["named_entities"])

['We', 'respectfully', 'invite', 'you', 'to', 'watch', 'a', 'special', 'edition', 'of', 'Across', 'China', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0]


In [51]:
def get_ner_label(value, ner_dict):
    for k, v in ner_dict.items():
        if v == value:
            return k
    return None

In [52]:
def send_prompt(prompt):
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            #{"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ],
        stream=False
    )

    return response.choices[0].message.content

### VANILLA METHOD

In [None]:
for line in dataset['train'][0]["sentences"][0:100]:
    # prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
    #     Based on the given entity label set, please recognize the named entities in the given text.\n \
    #     Return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
    #     Text: {" ".join(sentence)}"
    sentence = line["words"]
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
    Based on the given entity label set, please recognize the named entities in the given text.\n \
    Return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
    Text: \n {" ".join(sentence)}"

    answer = send_prompt(prompt)
    # print(answer)
    with open("vanilla_v3_nd.txt", "a") as file:
        file.write(f"{answer}\n")


### ROLE PLAY METHOD

In [43]:
for line in dataset['train'][0]["sentences"][0:100]:
    sentence = line["words"]
    prompt = f"You are a linguist expert professor. \
        You have a PHD and a post-doc in name entity recognition and you have been working with this in the past 20 years.\
        You are the best in the world at this task. You are creating a new NER dataset and you are labeling some sentences tokens.\
        Based on the given entity label set: {list(ner_dict.keys())}, \
        you are going to recognize the named entities in the given sentence.\n \
        You are going to return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
        Sentence: \n {" ".join(sentence)}"

    answer = send_prompt(prompt)
    # print(answer)
    with open("role_play_nd.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

### CHAIN OF THOUGHT

In [None]:
for line in dataset['train'][0]["sentences"][0:100]:
    
    sentence = line["words"]
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
        Based on the given entity label set, please recognize the named entities in the given text and show your reasoning step by step for how you identify each entity.\n \
        You must conclude your reasoning by returning a list of tuples with each token and its label. \
        At the left and right of the list insert a '$' symbol. For example $[('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]$\n \
        You must return the list with the shown format only one time and no more. \
        Text: \n {" ".join(sentence)}"

    answer = send_prompt(prompt)
    # print(answer)
    with open("COT_nd.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

### ROLE PLAY METHOD + CHAIN OF THOUGHT

In [44]:
for line in dataset['train'][0]["sentences"][0:100]:
    
    sentence = line["words"]
    prompt = f"You are a linguist expert professor. \
        You have a PHD and a post-doc in name entity recognition and you have been working with this in the past 20 years.\
        You are the best in the world at this task. You are creating a new NER dataset and you are labeling some sentences tokens.\
        Based on the given entity label set: {list(ner_dict.keys())}, \
        you are going to recognize the named entities in the given sentence and show your reasoning step by step for how you identify each entity.\n \
        You must conclude your reasoning by returning a list of tuples with each token and its label. \
        At the left and right of the list insert a '$' symbol. For example $[('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]$\n \
        You must return the list with the shown format only one time and not more. \
        The sentence is: \n {" ".join(sentence)}"

    answer = send_prompt(prompt)
    print(prompt)
    with open("role_playand_COT_nd.txt", "a", encoding="utf-8") as file:
        file.write(f"{answer}\n")

You are a linguist expert professor.         You have a PHD and a post-doc in name entity recognition and you have been working with this in the past 20 years.        You are the best in the world at this task. You are creating a new NER dataset and you are labeling some sentences tokens.        Based on the given entity label set: ['O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE', 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT', 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL', 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW', 'B-LANGUAGE', 'I-LANGUAGE'],         you are going to recognize the named entities in the given sentence and show your reasoning step by step for how you identify each entity.
         You must conclude your reasoning by returning a list of tuples with each token and its la

### Divided inputs


In [45]:
messages = []

def send_prompt_conversation(prompt):
    messages.append({"role": "user", "content": prompt})
    response = client.chat.completions.create(
        model="deepseek-chat", 
        messages=messages
    )
    reply = response.choices[0].message.content
    #print("Bot:", reply)
    messages.append({"role": "assistant", "content": reply})

for line in dataset['train'][0]["sentences"][0:100]:
    sentence = line["words"]
    prompt = f"Take as input the sentence: \n  {' '.join(sentence)}  \n \
        --Break it down into its tokens and return them as a list. \
        The tokens will be used for a named entity recognition task.\n"

    send_prompt_conversation(prompt)

    prompt_2 = f"Given the entity label set: {list(ner_dict.keys())},\n \
                based on the tokens you found in the previous step, \
                please recognize the named entities in the given text and return a list of tuples with each token and its label. \
                Return only the list in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]  \
                Do not return any explanation or additional text.\n."
    
    send_prompt_conversation(prompt_2)

    with open("divided_inputs_nd.txt", "a") as file:
        file.write(f"{messages[3]["content"]}\n")
    
    messages = []


### Self validating

In [53]:
messages = []

for line in dataset['train'][0]["sentences"][89:100]:
    sentence = line["words"]
    prompt = f"Given the entity label set: {list(ner_dict.keys())}.\n \
        Based on the given entity label set, please recognize the named entities in the given text.\n \
        Return only a list of tuples with each token and its label without explenation. Your output must be in the format: [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...]; nothing else\n \
        Text: \n {" ".join(sentence)}"

    send_prompt_conversation(prompt)

    prompt_2 = f"Please carefully review your previous answer and correct any possible mistakes. \
    Keep the same output structure: a list of tuples in this format: \
    [('In','O'), ('America','I-LOC'), ('is','O'), ('cold','O'), ...] \
    Do not return any explanation or additional text.\n."   
    send_prompt_conversation(prompt_2)

    with open("self_validating_nd.txt", "a") as file:
        file.write(f"{messages[3]["content"]}\n")
    
    messages = []

### EVALUATION

In [54]:
# Evaluation of results
import ast
from sklearn.metrics import precision_score, recall_score, f1_score

def parse_prediction(pred_str, ner_dict):
    pred_list = ast.literal_eval(pred_str)
    tokens = []
    label_ids = []
    for token, label in pred_list:
        tokens.append(token)
        label_ids.append(ner_dict.get(label, 0))
    return tokens, label_ids

    
def f1Score(file_path, dataset, dict=ner_dict):
    """
    Calculate the F1 score for the predictions in the file.
    """
    
    with open(file_path, 'r') as f:
        content = f.read()
    
    predictions = re.findall(r'(\[.*?\])', content, re.DOTALL)
    # predictions = []
    # for pred in predictions_with_duplicates:
    #     if pred not in predictions:
    #         predictions.append(pred)
    y_pred = []
    sentences = dataset["train"][0]["sentences"]
    y_true = [sentences[i]["named_entities"] for i in range(len(predictions))]

    for i, pred in enumerate(predictions):

        pred_tokens, pred_labels = parse_prediction(pred.strip(), ner_dict)
        # print(f"Pred tokens: {pred_tokens}")
        # print(f"Pred labels: {pred_labels}")
        true_tokens = dataset['train'][0]["sentences"][i]["words"]
        true_labels = dataset['train'][0]["sentences"][i]["named_entities"]
        # print(f"True tokens: {true_tokens}")
        # print(f"True labels: {true_labels}")

        aligned_preds = []
        pred_idx = 0

        if len(pred_labels) != len(true_labels):
            for true_token in true_tokens:
                if pred_idx < len(pred_tokens) and pred_tokens[pred_idx] == true_token:
                    aligned_preds.append(pred_labels[pred_idx])
                    pred_idx += 1
                else:
                    aligned_preds.append(-1)
        else:
            aligned_preds = pred_labels[:]
    
        y_pred.append(aligned_preds)


    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    print(f"Flat true: {flat_true}")
    print(f"Flat pred: {flat_pred}")
    
    precision = precision_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)
    recall = recall_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)
    f1 = f1_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)

    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    return f1

def f1Score_dollar(file_path, dataset, dict=ner_dict):
    """
    Calculate the F1 score for the predictions in the file,
    extracting only the string between two '$' symbols.
    """
    
    label_list = list(set(dict.values()))
    
    with open(file_path, 'r') as f:
        content = f.read()
    
    predictions = re.findall(r'\$(\[.*?\])\$', content, re.DOTALL)
    # predictions = []
    # for pred in predictions_with_duplicates:
    #     if pred not in predictions:
    #         predictions.append(pred)
    y_pred = []
    sentences = dataset["train"][0]["sentences"]
    y_true = [sentences[i]["named_entities"] for i in range(len(predictions))]

    for i, pred in enumerate(predictions):

        pred_tokens, pred_labels = parse_prediction(pred.strip(), ner_dict)
        # print(f"Pred tokens: {pred_tokens}")
        # print(f"Pred labels: {pred_labels}")
        true_tokens = dataset['train'][0]["sentences"][i]["words"]
        true_labels = dataset['train'][0]["sentences"][i]["named_entities"]
        # print(f"True tokens: {true_tokens}")
        # print(f"True labels: {true_labels}")

        aligned_preds = []
        pred_idx = 0

        if len(pred_labels) != len(true_labels):
            for true_token in true_tokens:
                if pred_idx < len(pred_tokens) and pred_tokens[pred_idx] == true_token:
                    aligned_preds.append(pred_labels[pred_idx])
                    pred_idx += 1
                else:
                    aligned_preds.append(-1)
        else:
            aligned_preds = pred_labels[:]
    
        y_pred.append(aligned_preds)


    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    print(f"Flat true: {flat_true}")
    print(f"Flat pred: {flat_pred}")
    
    precision = precision_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)
    recall = recall_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)
    f1 = f1_score(flat_true, flat_pred, labels=[0,1,2,3,4,5,6,7,8], average='micro', zero_division=0)

    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    return f1


In [55]:
print("\nVanilla v3 NER F1 Score:")
f1Score("vanilla_v3_nd.txt", dataset, ner_dict)
print("\nRole Play NER F1 Score:")
f1Score("role_play_nd.txt", dataset, ner_dict)
print("\nCOT NER F1 Score:")
f1Score_dollar("COT_nd.txt", dataset, ner_dict)
print("\nRole Play and COT NER F1 Score:")
f1Score_dollar("role_playand_COT_nd.txt", dataset, ner_dict)
print("\nDivided Inputs NER F1 Score:")
f1Score("divided_inputs_nd.txt", dataset, ner_dict)
print("\nSelf Validating NER F1 Score:")
f1Score("self_validating_nd.txt", dataset, ner_dict)


Vanilla v3 NER F1 Score:
Flat true: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 0, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 11, 12, 0, 31, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 27, 0, 0, 0, 29, 30, 30, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 0, 29, 30, 30, 30, 30, 30, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 10, 10, 10, 0, 9, 10, 0, 0, 7, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 8, 8, 0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 15, 0, 0, 3, 0, 0, 0, 0, 9, 0, 9, 0, 0, 9, 0, 9, 0, 0, 9, 0, 0, 0, 0, 15, 16, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 15, 16, 16, 16, 0, 0, 15, 0, 0, 3, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0

0.9149797570850202