In [None]:
!pip install -q -U google-genai
!pip install pydantic
!pip install seqeval
!pip install google-generativeai
!pip install datasets[all]
!pip install pandas
!pip install ollama


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
# Print a sample from the training data
# NOTE(review): `train_data` is not defined in any cell shown here; presumably an
# earlier cell loads the dataset. Confirm that cell runs first on a fresh kernel.
example = train_data[11001]  # fixed index used as a spot check
print("Example from the training data:")
print(example)
# The two fields used throughout the notebook: the token list and its tag ids
print("Tokens:", example["tokens"])
print("Labels:", example["ner_tags"])

In [None]:
# Print a sample from the training data
# NOTE(review): this cell repeats the preceding cell line for line; one of the two
# can probably be removed. `train_data` is not defined in any cell shown here.
example = train_data[11001]  # fixed index used as a spot check
print("Example from the training data:")
print(example)
# The two fields used throughout the notebook: the token list and its tag ids
print("Tokens:", example["tokens"])
print("Labels:", example["ner_tags"])

In [None]:
import csv
import os


def parse_response(tokens : list, response_labels : str, true_labels : list) -> list:
    '''
    Turn a model reply of the form "ner_tags: x, x, ..., x" into per-token rows.

    Args:
        tokens: the tokens of the sentence.
        response_labels: the reply text. The labels are read from the text between
            the first and the second ':'; a reply without any ':' is read as a bare
            comma-separated list.
        true_labels: the gold label of each token (same length as tokens).

    Returns:
        A list of [token, predicted_label, true_label] lists, one per token.
        Surplus predicted labels are dropped and missing ones are filled with 0.

    Raises:
        ValueError: if a label is not an integer.
        AssertionError: if a label is outside the range 0..8.
    '''
    parts = response_labels.split(":")
    # Without a "ner_tags:" prefix there is no parts[1]; fall back to the whole reply
    payload = parts[1] if len(parts) > 1 else parts[0]

    labels = [piece.strip() for piece in payload.strip('\n').split(',')]
    # A trailing comma or an empty payload leaves blank pieces at the end; drop them
    # so they are padded with '0' below instead of crashing int('')
    while labels and not labels[-1]:
        labels.pop()

    # Force exactly one label per token: truncate the surplus, pad the shortfall
    labels = labels[:len(tokens)]
    labels += ['0'] * (len(tokens) - len(labels))

    rows = []
    for i, token in enumerate(tokens):
        pred_label = int(labels[i])
        # Raised explicitly rather than via `assert` so the check survives `python -O`;
        # the exception type is the one the original assert produced
        if not 0 <= pred_label <= 8:
            raise AssertionError("Predicted label is out of range")
        rows.append([token, pred_label, true_labels[i]])

    return rows

def save_to_csv_vanilla(tokens : list, pred_labels : list, true_labels : list, filename : str) -> None:
    '''
    Append the rows of one sentence to a CSV file used for later analysis.

    Only tokens whose predicted label or true label is non-zero are written; a token
    that is 0 in both lists is skipped. The header row ('token', 'pred', 'true') is
    written when the file does not exist yet or is empty.

    Args:
        tokens: the tokens of the sentence.
        pred_labels: predicted label per token.
        true_labels: gold label per token.
        filename: path of the CSV file; missing parent directories are created.
    '''
    # Create the target directory up front, otherwise open() fails on every call
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # A zero-byte file (e.g. left by an interrupted run) still needs its header
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    data = [
        [token, pred_labels[i], true_labels[i]]
        for i, token in enumerate(tokens)
        if pred_labels[i] != 0 or true_labels[i] != 0
    ]

    # Single append-mode open; explicit UTF-8 so non-ASCII tokens do not depend on
    # the platform's default encoding
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['token', 'pred', 'true'])
        writer.writerows(data)

In [None]:
def store_predicted_labels(pred_labels : list, votes : list) -> None:
    '''
    Record one round of predictions in the per-token vote lists.

    The label at position i of pred_labels is appended to votes[i]. votes is
    modified in place and nothing is returned.
    '''
    for position, label in enumerate(pred_labels):
        votes[position].append(label)

In [None]:
import random

# Seed the module-level generator so random.sample below draws the same items
# each time this cell runs against the same test_data
random.seed(0)

# Sample 300 random elements from the test set
# NOTE(review): `test_data` is not defined in any cell shown here; presumably an
# earlier cell loads it. random.sample raises ValueError if it holds fewer than 300 items.
sampled_test_data = random.sample(list(test_data), 300)

# Print the first few samples to verify
for i, sample in enumerate(sampled_test_data[:5]):  # Display the first 5 samples
    print(f"Sample {i + 1}:")
    print(sample)
    print("\n")  # newline plus print's own newline: leaves blank space between samples

In [None]:
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")

# One dependency tree per sampled sentence, in the same order as sampled_test_data
dependency_trees = []

for sample in sampled_test_data:
    tokens = sample["tokens"]
    # Build the Doc from the dataset's own tokens instead of parsing " ".join(tokens):
    # spaCy's tokenizer may split the joined text differently from the dataset, and
    # then "index"/"head_index" would no longer line up with `tokens`.
    doc = Doc(nlp.vocab, words=tokens)
    # Run every pipeline component on the pre-tokenized Doc
    for component_name, component in nlp.pipeline:
        doc = component(doc)
    tree = [
        {
            "text": token.text,
            "dep": token.dep_,
            "head": token.head.text,
            "pos": token.pos_,
            "index": token.i,
            "head_index": token.head.i
        }
        for token in doc
    ]
    # Guard the invariant the prompt relies on: one tree node per dataset token
    assert len(tree) == len(tokens), "dependency tree is not aligned with the tokens"
    dependency_trees.append(tree)
    # Link the tree to its sample (the sample dict is modified in place)
    sample['dependency_tree'] = tree

# Now each sample has the tree inside
print(sampled_test_data[0]['dependency_tree'])


In [None]:
import ollama
# Exploratory check: list the names the ollama package exposes
print(dir(ollama))


In [None]:

from ollama import Client

# Client created with no arguments, i.e. with the library's default settings
client = Client()

# Ask the client for its model list and print it
# (presumably the models available on the Ollama server it talks to; confirm)
models = client.list()
print(models)



********************************************************************************************************************
Vanilla

You are a strict NER tagging system.

Given the following NER tags:
{{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

Your task is to assign the correct tag number to each token in this sentence:
{tokens}

This sentence contains exactly {len(tokens)} tokens.

Respond ONLY with:
ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

Do NOT include explanations, thoughts, or any other content.
Do NOT write anything before or after "ner_tags: ...".
Just print the sequence in the format specified.
********************************************************************************************************************

Word-level named entity reflection (doesn't work well)

You are a strict NER tagging system.

Given the following NER tags:
{{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

Your task is to assign the correct tag number to each token in this sentence:
{tokens}

This sentence contains exactly {len(tokens)} tokens.

For each word in the text, generate a short summary (maximum 10 words) reasoning about its possible named entity category.
Respond ONLY with:
1)explanation for each word.
2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
Just print the sequence in the format specified.
********************************************************************************************************************


Multi-turn adaptive refinement (works slightly better than the simple vanilla prompt)

You are a strict NER tagging system.

Given the following NER tags:
{{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

Your task is to assign the correct tag number to each token in this sentence:
{tokens}

This sentence contains exactly {len(tokens)} tokens.

First, extract potential named entities, then refine the list by validating their relationships within the text.
Finally, consolidate the results.

Respond ONLY with:
1)the explanation as indicated above.
2)ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)
Just print the sequence in the format specified.
********************************************************************************************************************


Dependency-based entity validation (doesn't work well)

You are a strict NER tagging system.

Given the following NER tags:
{{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

Your task is to assign the correct tag number to each token in this sentence:
{tokens}

This sentence contains exactly {len(tokens)} tokens.

You are also given the dependency tree of the sentence in the following format:
{dependency_tree_str}

First, extract potential named entities based on the tokens.
Then, refine and validate the entities by analyzing their syntactic relationships according to the dependency tree.
Finally, consolidate the results into a final tagging.

Respond ONLY with:
ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

Just print the sequence in the format specified.
********************************************************************************************************************


In [None]:
import re
import os
import csv
import json
from datasets import load_dataset
from ollama import Client

# Initialize client
# (rebinds `client`; an earlier cell already creates one the same way)
client = Client()

def clean_response(text: str) -> list:
    """Extract the predicted tag numbers from a raw model reply.

    Any <think>...</think> block is removed first. The numbers are then taken from
    the first "ner_tags: ..." sequence, which may be wrapped in an opening '[';
    if no such sequence exists, every integer in the remaining text is returned.

    Args:
        text: the raw reply of the model.

    Returns:
        The tag numbers as a list of digit strings (possibly empty).
    """
    # Remove <think>...</think> blocks
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    # Look for 'ner_tags: 0, 1, 2, ...'. The captured group must start with a digit:
    # with a purely [0-9,\s]+ group, a reply like "ner_tags: [0, 1]" matched on the
    # single space before '[' and produced an empty list instead of the tags.
    match = re.search(r'ner_tags\s*:\s*\[?\s*(\d[\d,\s]*)', cleaned, flags=re.IGNORECASE)

    # If 'ner_tags:' is not found, fall back to every number left in the text
    number_str = match.group(1) if match else cleaned

    # Extract all integers as strings
    return re.findall(r'\d+', number_str)



# Model and output file are named once so they are easy to find and change
MODEL_NAME = "deepseek-r1:14b"
OUTPUT_CSV = "data/vanilla_test_300_ds_14b_DBEVS.csv"

# Create the output directory before the (slow) model calls start; otherwise every
# save would fail and be swallowed by the per-sentence error handler below.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# NOTE: save_to_csv_vanilla appends to the file, so re-running this cell adds the
# same sentences to the CSV again.
for j, sample in enumerate(sampled_test_data):
    tokens = sample['tokens']
    true_labels = sample['ner_tags']
    dependency_tree = dependency_trees[j]

    # Convert dependency_tree to JSON string
    dependency_tree_str = json.dumps(dependency_tree, indent=2)

    prompt = f"""You are a strict NER tagging system.

Given the following NER tags:
{{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}

Your task is to assign the correct tag number to each token in this sentence:
{tokens}

This sentence contains exactly {len(tokens)} tokens.

You are also given the dependency tree of the sentence in the following format:
{dependency_tree_str}

First, extract potential named entities based on the tokens.
Then, refine and validate the entities by analyzing their syntactic relationships according to the dependency tree.
Finally, consolidate the results into a final tagging.

Respond ONLY with:
ner_tags: x, x, x, ..., x  ← (exactly {len(tokens)} integers)

Just print the sequence in the format specified.
"""

    # Reset per sentence: at cell level `response` survives from the previous
    # iteration, so a failed call would otherwise be reported with a stale reply.
    raw_text = None
    try:
        response = client.generate(model=MODEL_NAME, prompt=prompt)
        raw_text = response.response
        pred_tags_str = clean_response(raw_text)

        # Skip the sentence when the model did not return one label per token.
        # The sentence number and raw reply are added once by the handler below.
        if len(pred_tags_str) != len(tokens):
            raise ValueError(f"Number of labels ({len(pred_tags_str)}) does not match number of tokens ({len(tokens)})")

        # Parsing (conversion and validation); parse_response expects the
        # "ner_tags: ..." text form, so the cleaned numbers are joined back into it
        parsed_data = parse_response(tokens, f"ner_tags: {','.join(pred_tags_str)}", true_labels)
        pred_labels = [row[1] for row in parsed_data]

        # Debug print
        print(f"[✓] Sentence {j}")
        print("Tokens:    ", tokens)
        print("Predicted: ", pred_labels)
        print("True:      ", [row[2] for row in parsed_data])
        print("---")

        # Save to file
        save_to_csv_vanilla(tokens, pred_labels, true_labels, OUTPUT_CSV)

    except Exception as e:
        # Best effort: report the failure and carry on with the next sentence
        print(f"[!] Error at sentence {j}: {e}")
        print(f"Raw response: {raw_text}")
        print("---")