In [None]:
!pip install -q -U google-genai
!pip install pydantic
!pip install seqeval
!pip install google-generativeai
!pip install datasets[all]
!pip install pandas
!pip install ollama


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
from datasets import load_dataset
import random

# Seed Python's RNG first so every random draw made later is repeatable
random.seed(0)

# Load CoNLL-2003 through `datasets.load_dataset`
dataset = load_dataset("eriktks/conll2003")

# Bind each split to its own name for the cells that follow
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Show the first training record as a sanity check
print(train_data[0])

In [None]:
import csv
import os


def parse_response(tokens : list, response_labels : str, true_labels : list) -> list:
    '''
    Align the model's predicted tag ids with the sentence tokens.

    Args:
        tokens: the tokens of the sentence.
        response_labels: a string such as "ner_tags: 0,1,2". Only the text
            between the first and the second colon is read; if the string
            contains no colon at all, the whole string is used.
        true_labels: the gold tag id of every token (same length as tokens).

    Returns:
        A list of [token, predicted_label, true_label] lists, one per token.

    The label list is forced to the length of `tokens`: surplus labels are
    dropped and missing ones are filled with '0'. A label that cannot be read
    as an integer (for example the empty string produced by an empty model
    answer) is treated the same way as a missing label and becomes 0, so the
    sentence is still scored instead of raising ValueError. Labels outside
    0..8 are reported on stdout but kept unchanged.
    '''
    segments = response_labels.split(":")
    # "ner_tags: 0,1,2" -> " 0,1,2"; tolerate a bare "0,1,2" without prefix
    payload = segments[1] if len(segments) > 1 else segments[0]
    raw_labels = payload.strip('\n').split(',')

    # One label per token: truncate the surplus, pad the shortfall with '0'
    raw_labels = raw_labels[:len(tokens)]
    raw_labels = raw_labels + ['0'] * (len(tokens) - len(raw_labels))

    rows = []
    for i, token in enumerate(tokens):
        try:
            pred_label = int(raw_labels[i].strip())
        except ValueError:
            # Unreadable label: same policy as a missing label
            pred_label = 0
        if pred_label < 0 or pred_label > 8:
            print(f"Token: {token}, Predicted Label: {pred_label}, True Label: {true_labels[i]}")
        rows.append([token, pred_label, true_labels[i]])

    return rows

def save_to_csv_vanilla(tokens : list, pred_labels : list, true_labels : list, filename : str) -> None:
    '''
    Append the entity-relevant rows of one sentence to a CSV file.

    Args:
        tokens: the tokens of the sentence.
        pred_labels: predicted tag id per token.
        true_labels: gold tag id per token.
        filename: path of the CSV file; it is opened in append mode.

    A header row ('token', 'pred', 'true') is written only when the file does
    not exist yet. Tokens whose predicted AND true labels are both 0 are
    skipped, so the file only holds tokens where at least one side is an
    entity. The parent directory of `filename` is created when missing;
    without this the open() call fails with FileNotFoundError.
    '''
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        # os.makedirs('') raises, hence the guard for bare file names
        os.makedirs(parent_dir, exist_ok=True)

    needs_header = not os.path.isfile(filename)

    if len(pred_labels) == 0 and len(true_labels) == 0:
        rows = []
    else:
        rows = [
            [tokens[i], pred_labels[i], true_labels[i]]
            for i in range(len(tokens))
            if pred_labels[i] != 0 or true_labels[i] != 0
        ]

    if not needs_header and not rows:
        return

    # Single open in append mode: header (first call only) followed by the rows
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['token', 'pred', 'true'])
        writer.writerows(rows)

In [None]:
import csv
import os

# Tag ids used for entity categories in the no-BIO prompts (the B- ids of the BIO scheme)
begin_tags = {1,3,5,7}

def parse_response_no_bio(tokens : list, response_labels : str, true_labels : list) -> list:
    '''
    Align category-only predictions with the tokens and rebuild BIO tag ids.

    Args:
        tokens: the tokens of the sentence.
        response_labels: a string such as "ner_tags: 0,1,1,3". Only the text
            between the first and the second colon is read; if the string
            contains no colon at all, the whole string is used.
        true_labels: the gold tag id of every token (same length as tokens).

    Returns:
        A list of [token, predicted_label, true_label] lists, one per token.

    The label list is forced to the length of `tokens` (surplus dropped,
    shortfall padded with '0'); labels that cannot be read as an integer become
    0 instead of raising ValueError. A label from `begin_tags` is turned into
    the matching inside tag (label + 1) only when the previous token was
    assigned the SAME category, either as its begin tag or as its inside tag.
    A change of category therefore starts a new entity with a begin tag.
    Labels outside 0..8 are reported on stdout but kept.
    '''
    segments = response_labels.split(":")
    # "ner_tags: 0,1,1" -> " 0,1,1"; tolerate a bare "0,1,1" without prefix
    payload = segments[1] if len(segments) > 1 else segments[0]
    raw_labels = payload.strip('\n').split(',')

    # One label per token: truncate the surplus, pad the shortfall with '0'
    raw_labels = raw_labels[:len(tokens)]
    raw_labels = raw_labels + ['0'] * (len(tokens) - len(raw_labels))

    rows = []
    for i, token in enumerate(tokens):
        try:
            label = int(raw_labels[i].strip())
        except ValueError:
            # Unreadable label: same policy as a missing label
            label = 0

        previous = rows[i - 1][1] if i > 0 else 0
        if label in begin_tags and previous in (label, label + 1):
            # Previous token is B-X or I-X of the same category: continue the entity
            pred_label = label + 1
        else:
            # First token of an entity, 'O', or anything else: keep as is
            pred_label = label

        if pred_label < 0 or pred_label > 8:
            print(f"Token: {token}, Predicted Label: {pred_label}, True Label: {true_labels[i]}")
        rows.append([token, pred_label, true_labels[i]])

    return rows

In [None]:
# Record one round of predictions: the label of token k is appended to votes[k]
def store_predicted_labels(pred_labels : list, votes : list) -> None:
    """Append each predicted label to the vote list at the same position.

    `votes` is modified in place and must hold at least as many lists as
    there are predicted labels.
    """
    for position, label in enumerate(pred_labels):
        votes[position].append(label)

In [None]:
import random

# Re-seed so the draw below is the same on every run of this cell
random.seed(0)

# Draw 100 test sentences without replacement
sampled_test_data = random.sample(list(test_data), 100)

# Preview the first five drawn sentences
for i, sample in enumerate(sampled_test_data[:5]):
    print(f"Sample {i + 1}:", sample, "\n", sep="\n")

In [None]:
import spacy

from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")

# dependency_trees[k] is the parse of sampled_test_data[k]
dependency_trees = []

for sample in sampled_test_data:
    tokens = sample["tokens"]
    # Parse the dataset's own tokenization. Joining the tokens into a string and
    # letting spaCy tokenize again can split or merge tokens, so the tree would
    # no longer line up one-to-one with the tokens shown in the prompts.
    # Passing a pre-built Doc skips the tokenizer (spaCy >= 3.2).
    doc = nlp(Doc(nlp.vocab, words=tokens))
    tree = [
        {
            "text": token.text,
            "dep": token.dep_,
            "head": token.head.text,
            "pos": token.pos_,
            "index": token.i,
            "head_index": token.head.i
        }
        for token in doc
    ]
    dependency_trees.append(tree)
    # Keep the tree next to the sentence it belongs to
    sample['dependency_tree'] = tree

# Now each sample has the tree inside
print(sampled_test_data[0]['dependency_tree'])


In [None]:
import re
import os
import csv
import json
from ollama import Client

# Create the Ollama client shared by all prompting cells below.
# No arguments are passed, so the library's default connection settings apply.
client = Client()

def clean_response(text):
    """Extract the predicted tag numbers from the model's raw output.

    Steps:
      1. Remove every <think>...</think> block.
      2. Find the first 'ner_tags:' that is followed by at least one number
         and take the run of digits, commas and whitespace after it. An
         opening '[' right after the colon is tolerated, and an occurrence
         with no number after it (such as an echoed 'ner_tags: x, x'
         template) is skipped instead of yielding an empty result.
      3. If no such occurrence exists, fall back to every integer found in
         the remaining text.

    Args:
        text: the raw string returned by the model.

    Returns:
        The numbers as a list of digit strings, in order of appearance
        (empty when the text contains none).
    """
    # Remove <think>...</think> blocks (DOTALL lets '.' span line breaks)
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    # The captured group must contain a digit, so label-free matches are rejected
    match = re.search(r'ner_tags\s*:\s*\[?([0-9,\s]*\d[0-9,\s]*)', cleaned)
    number_str = match.group(1) if match else cleaned

    # Extract all integers as strings
    return re.findall(r'\d+', number_str)


def format_example(ex):
    """Render one dataset record as three 'Label: value' lines.

    Reads the 'tokens', 'pos_tags' and 'ner_tags' entries of `ex` and returns
    them, in that order, joined by newlines.
    """
    sections = (
        ("Tokens", ex['tokens']),
        ("POS tags", ex['pos_tags']),
        ("NER tags", ex['ner_tags']),
    )
    return "\n".join(f"{label}: {value}" for label, value in sections)

# Three fixed training records (indices 11000, 12000, 13000), presumably used as
# in-prompt examples by later cells
example1, example2, example3 = (
    train_data[record_index] for record_index in (11000, 12000, 13000)
)


In [None]:
def format_example_NOBIO(ex):
    """Render one dataset record with its NER tags collapsed to category ids.

    Every inside tag id (2, 4, 6, 8) is replaced by the begin tag id of the
    same category (1, 3, 5, 7); 0 stays 0. Tokens and POS tags are printed
    unchanged. The second and third lines of the result start with four
    spaces.
    """
    # Map BIO ids to NOBIO ids for the examples
    collapse = {0: 0}
    for begin_id in (1, 3, 5, 7):
        collapse[begin_id] = begin_id
        collapse[begin_id + 1] = begin_id
    flat_tags = list(map(collapse.__getitem__, ex['ner_tags']))
    return (
        f"Tokens: {ex['tokens']}\n"
        f"    POS tags: {ex['pos_tags']}\n"
        f"    NER tags: {flat_tags}"
    )

********************************************************************************************************************
Vanilla
********************************************************************************************************************

In [None]:
# Vanilla BIO prompting: one request per sampled sentence, results appended to a CSV.
output_path = "data100_ds/vanilla_test_100_ds_14b.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
Vanilla with no BIO tagging
********************************************************************************************************************

In [None]:
# Vanilla prompting with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
Word-level named entity reflection (doesn't work well)
********************************************************************************************************************

In [None]:
# Word-level reflection with BIO tags: the model explains each word, then lists the tags.
output_path = "data100_ds/vanilla_test_100_ds_14b_WLNER.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    For each word in the text, generate a short summary (maximum 10 words) reasoning about its possible named entity category.
    Respond ONLY with:
    1)explanation for each word.
    2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
Word-level named entity reflection with no BIO tagging
********************************************************************************************************************

In [None]:
# Word-level reflection with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_WLNER_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    For each word in the text, generate a short summary (maximum 10 words) reasoning about its possible named entity category.
    Respond ONLY with:
    1)explanation for each word.
    2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
Multi-turn adaptive refinement (works a bit better than simple vanilla with 300 samples, worse with 100 samples)
********************************************************************************************************************

In [None]:
# Multi-turn adaptive refinement with BIO tags: extract, refine, consolidate, then list the tags.
output_path = "data100_ds/vanilla_test_100_ds_14b_MTAR.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    First, extract potential named entities, then refine the list by validating their relationships within the text.
    Finally, consolidate the results.

    Respond ONLY with:
    1)the explanation as indicated above.
    2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
Multi-turn adaptive refinement with no BIO tagging
********************************************************************************************************************

In [None]:
# Multi-turn adaptive refinement with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_MTAR_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    First, extract potential named entities, then refine the list by validating their relationships within the text.
    Finally, consolidate the results.

    Respond ONLY with:
    1)the explanation as indicated above.
    2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
Dependency-based entity validation (doesn't work well)
********************************************************************************************************************

In [None]:
# Dependency-based entity validation with BIO tags: the prompt also carries the sentence's dependency tree.
output_path = "data100_ds/vanilla_test_100_ds_14b_DBEV.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']
    dependency_tree = dependency_trees[j]

    # Convert dependency_tree to JSON string
    dependency_tree_str = json.dumps(dependency_tree, indent=2)

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    You are also given the dependency tree of the sentence in the following format:
    {dependency_tree_str}

    First, extract potential named entities based on the tokens.
    Then, refine and validate the entities by analyzing their syntactic relationships according to the dependency tree.
    Finally, consolidate the results into a final tagging.

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
Dependency-based entity validation with no BIO 
********************************************************************************************************************

In [None]:
# Dependency-based entity validation with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_DBEV_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']
    dependency_tree = dependency_trees[j]

    # Convert dependency_tree to JSON string
    dependency_tree_str = json.dumps(dependency_tree, indent=2)

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    This sentence contains exactly {len(tokens)} tokens.

    You are also given the dependency tree of the sentence in the following format:
    {dependency_tree_str}

    First, extract potential named entities based on the tokens.
    Then, refine and validate the entities by analyzing their syntactic relationships according to the dependency tree.
    Finally, consolidate the results into a final tagging.

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
POS-guided named entity recognition (works better than vanilla, but with BIO included)
********************************************************************************************************************

In [None]:
# POS-guided NER with BIO tags: the prompt also carries the dataset's POS tag of every token.
output_path = "data100_ds/vanilla_test_100_ds_14b_POSGNER.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    pos_tags = sample['pos_tags']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
POS-guided named entity recognition with no BIO tagging
********************************************************************************************************************

In [None]:
# POS-guided NER with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_POSGNER_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    pos_tags = sample['pos_tags']
    true_labels = sample['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
POS-dependency hybrid NER (doesn't work well)
********************************************************************************************************************

In [None]:
# POS-dependency hybrid NER with BIO tags: the prompt carries both the POS tags and the dependency tree.
output_path = "data100_ds/vanilla_test_100_ds_14b_POSDHNER.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    pos_tags = sample['pos_tags']
    true_labels = sample['ner_tags']
    dependency_tree = dependency_trees[j]

    # Convert dependency_tree to JSON string
    dependency_tree_str = json.dumps(dependency_tree, indent=2)

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}
        
    You are also given the dependency tree of the sentence in the following format:
    {dependency_tree_str}

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")


********************************************************************************************************************
POS-dependency hybrid NER with no BIO tagging
********************************************************************************************************************

In [None]:
# POS-dependency hybrid NER with category-only tags; BIO ids are rebuilt by parse_response_no_bio.
output_path = "data100_ds/vanilla_test_100_ds_14b_POSDHNER_NOBIO.csv"
# Create the output folder up front so a missing directory cannot fail every save
os.makedirs(os.path.dirname(output_path), exist_ok=True)

for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    pos_tags = sample['pos_tags']
    true_labels = sample['ner_tags']
    dependency_tree = dependency_trees[j]

    # Convert dependency_tree to JSON string
    dependency_tree_str = json.dumps(dependency_tree, indent=2)

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}
        
    You are also given the dependency tree of the sentence in the following format:
    {dependency_tree_str}

    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset per sentence: if the request itself fails, the error report must not
    # show the reply that belonged to a previous sentence
    raw_text = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation)
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, output_path)

    except Exception as e:
        # Best effort: report the failure and continue with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")

********************************************************************************************************************
Example-driven POS NER
********************************************************************************************************************

In [None]:
# Example-driven POS prompting with the full BIO tag set.
# For every sampled test sentence: build the prompt (POS tags plus three worked
# examples), query the model, parse the predicted tag numbers and store them
# next to the gold labels in a CSV file.
# The prompt of this experiment does not contain a dependency tree, so none is
# looked up or serialised here.
for j in range(len(sampled_test_data)):
    tokens = sampled_test_data[j]['tokens']
    pos_tags = sampled_test_data[j]['pos_tags']
    true_labels = sampled_test_data[j]['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}
        
    You are also given three correct examples.
    Here are three correct examples:

    Example 1:
    {format_example(example1)}

    Example 2:
    {format_example(example2)}

    Example 3:
    {format_example(example3)}
        
    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset for each sentence: at cell scope `response` survives from the previous
    # iteration, so without this a failed request would report the previous
    # sentence's output as the "raw response" of the current one.
    response = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation); parse_response splits on ":" and
        # reads the comma-separated labels after it, hence the "ner_tags:" prefix.
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, "data100_ds/vanilla_test_100_ds_14b_EDPOSNER.csv")

    except Exception as e:
        # Best effort: log the failure and continue with the next sentence.
        # Nothing is written to the CSV for a sentence that failed.
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {response.response if response is not None else 'None'}")
        print("---")


********************************************************************************************************************
Example-driven POS NER with no BIO tagging
********************************************************************************************************************

In [None]:
# Example-driven POS prompting with the collapsed (non-BIO) tag set.
# For every sampled test sentence: build the prompt (POS tags plus three worked
# examples), query the model, parse the predicted tag numbers and store them
# next to the gold labels in a CSV file.
# The prompt of this experiment does not contain a dependency tree, so none is
# looked up or serialised here.
for j in range(len(sampled_test_data)):
    tokens = sampled_test_data[j]['tokens']
    pos_tags = sampled_test_data[j]['pos_tags']
    true_labels = sampled_test_data[j]['ner_tags']

    prompt = f"""You are a strict NER tagging system.

    Given the following NER tags:
    {{'O': 0, 'PER': 1, 'ORG': 3, 'LOC': 5, 'MISC': 7}}

    Your task is to assign the correct tag number to each token in this sentence:
    {tokens}

    You are also given the POS tag (part-of-speech) for each token.
    POS tags (in order, one per token):
    {pos_tags}
        
    You are also given three correct examples.
    Here are three correct examples:

    Example 1:
    {format_example(example1)}

    Example 2:
    {format_example(example2)}

    Example 3:
    {format_example(example3)}
        
    Respond ONLY with:
    ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

    Do NOT include explanations, thoughts, or any other content.
    Do NOT write anything before or after "ner_tags: ...".
    Just print the sequence in the format specified.
    """

    # Reset for each sentence: at cell scope `response` survives from the previous
    # iteration, so without this a failed request would report the previous
    # sentence's output as the "raw response" of the current one.
    response = None
    try:
        response = client.generate(model="deepseek-r1:14b", prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Parsing (conversion and validation); the cleaned tags are re-joined
        # behind a "ner_tags:" prefix before being handed to the parser.
        parsed_data = parse_response_no_bio(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", [x[1] for x in parsed_data])
        print("True:      ", [x[2] for x in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, [x[1] for x in parsed_data], true_labels, "data100_ds/vanilla_test_100_ds_14b_EDPOSNER_NOBIO.csv")

    except Exception as e:
        # Best effort: log the failure and continue with the next sentence.
        # Nothing is written to the CSV for a sentence that failed.
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {response.response if response is not None else 'None'}")
        print("---")

In [None]:
# NOTE(review): this path carries an extra "_ONEEX" suffix compared with the file
# evaluated in the following cell — confirm this is the file meant to be cleaned.
filename = 'data100_ds/vanilla_test_100_ds_14b_EDPOSNER_NOBIO_ONEEX.csv'

# Rewrite the CSV in place so that every 'pred' value is an integer in [0, 8].
# Anything that is not an integer, or lies outside that range, becomes 0
# (which is not counted in the evaluation).
with open(filename, newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)
    fieldnames = reader.fieldnames  # kept so the rewritten file has the same columns
    rows = list(reader)

for row in rows:
    try:
        pred = int(row['pred'])
    except ValueError:
        pred = 0
    if not 0 <= pred <= 8:
        pred = 0
    row['pred'] = str(pred)

# Overwrite the original file with the sanitised rows.
with open(filename, 'w', newline='', encoding='utf-8') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

In [None]:
import csv
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# Step 1: Mappings between the numeric tag ids stored in the CSV and the BIO label names
label_mapping = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
    7: 'B-MISC',
    8: 'I-MISC'
}

# Inverse of label_mapping (not used by the evaluation below).
category_to_index = {
    'O': 0,
    'B-PER': 1, 
    'I-PER': 2, 
    'B-ORG': 3, 
    'I-ORG': 4, 
    'B-LOC': 5, 
    'I-LOC': 6, 
    'B-MISC': 7, 
    'I-MISC': 8
    }

# Step 2: Read the CSV and convert predictions and true labels
true_seqs = []
pred_seqs = []
current_true = []
current_pred = []

with open('data100_ds/vanilla_test_100_ds_14b_EDPOSNER_NOBIO.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Gold labels are looked up strictly: a bad value here means the file is corrupt.
        true_label = label_mapping[int(row['true'])]

        # Model output may contain non-integer or out-of-range tags; treat those
        # as 'O' instead of aborting the whole evaluation with KeyError/ValueError.
        try:
            pred_index = int(row['pred'])
        except (TypeError, ValueError):
            pred_index = 0
        pred_label = label_mapping.get(pred_index, 'O')

        current_true.append(true_label)
        current_pred.append(pred_label)

    # NOTE(review): every row of the file ends up in one single sequence, so
    # sentence boundaries are not visible to seqeval — confirm this is intended.
    true_seqs.append(current_true)
    pred_seqs.append(current_pred)

# Step 3: Compute metrics
print("Precision:", precision_score(true_seqs, pred_seqs))
print("Recall:", recall_score(true_seqs, pred_seqs))
print("F1 Score:", f1_score(true_seqs, pred_seqs))

# Optional detailed report
print("\nDetailed classification report:\n")
print(classification_report(true_seqs, pred_seqs))
#vanilla_test_100_ds_14b_POSGNER :  0.5042492917847027 

#vanilla_test_100_ds_14b_EDPOSNER : F1 Score: 0.3553008595988538
#vanilla_test_100_ds_14b_EDPOSNER_NOBIO : F1 Score: 0.46511627906976744