In [None]:
!pip install -q -U google-genai
!pip install pydantic
!pip install seqeval
!pip install google-generativeai

In [None]:
from datasets import load_dataset
import random

# Seed Python's global RNG so that the random sampling done later is repeatable
random.seed(0)

# Download / load CoNLL-2003 and unpack its three splits in one go
dataset = load_dataset("eriktks/conll2003")
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Sanity check: show the first training record
print(train_data[0])

In [None]:
# Inspect one arbitrary training record: the raw record first, then its
# token list and the matching NER label ids.
example = train_data[11001]
print("Example from the training data:", example, sep="\n")
print(f"Tokens: {example['tokens']}")
print(f"Labels: {example['ner_tags']}")

In [None]:
# Number of sentences in each split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(validation_data)}")
print(f"Test set size: {len(test_data)}")

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [None]:
import csv
import os


def parse_response(tokens : list, response_labels : str, true_labels : list) -> list:
    '''
    Convert the raw model answer into per-token rows.

    The answer is expected to look like ``ner_tags: 0, 1, 2, 0``. Parsing is
    tolerant of imperfect answers so that one bad response does not abort a
    long run:

    * the ``ner_tags:`` prefix is located if present; otherwise the text after
      the first colon is used, and if there is no colon the whole text is used;
    * each comma-separated item contributes its leading integer; an item with
      no leading integer (empty, a word, ...) is treated as 0 ('O');
    * extra labels are dropped and missing labels are padded with 0 so that
      exactly one label per token is returned;
    * a missing (non-string) response yields 0 for every token.

    Parameters:
        tokens: tokens of the sentence, in order.
        response_labels: raw text returned by the model.
        true_labels: gold label id of each token (indexed in parallel with ``tokens``).

    Returns:
        A list with one ``[token, predicted_label, true_label]`` list per token.
        Labels outside 0..8 are reported on stdout but kept unchanged.
    '''
    import re  # only needed here; kept local so the cell is self-contained

    raw_text = response_labels if isinstance(response_labels, str) else ""

    # Locate the list of labels inside the answer.
    marker = re.search(r'ner_tags\s*:', raw_text)
    if marker:
        payload = raw_text[marker.end():]
    elif ':' in raw_text:
        payload = raw_text.split(':', 1)[1]
    else:
        payload = raw_text

    # Take the leading integer of every comma-separated item (0 when absent).
    pred_labels = []
    for piece in payload.split(','):
        number = re.match(r'\s*([+-]?\d+)', piece)
        pred_labels.append(int(number.group(1)) if number else 0)

    # Force exactly one label per token: truncate extras, pad the rest with 'O'.
    n_tokens = len(tokens)
    pred_labels = pred_labels[:n_tokens] + [0] * (n_tokens - len(pred_labels))

    rows = []
    for i in range(n_tokens):
        pred_label = pred_labels[i]
        if pred_label < 0 or pred_label > 8:
            # Not a valid tag id; reported for inspection but deliberately not fatal.
            print(f"Token: {tokens[i]}, Predicted Label: {pred_label}, True Label: {true_labels[i]}")
        rows.append([tokens[i], pred_label, true_labels[i]])

    return rows

def save_to_csv_vanilla(tokens : list, pred_labels : list, true_labels : list, filename : str) -> None:
    '''
    Append the non-trivial rows of one sentence to a CSV file.

    A ``token,pred,true`` header is written first when the file does not exist
    yet (or is empty). Rows where both the predicted and the true label are 0
    are skipped, so the file only contains tokens that are an entity in the
    prediction and/or in the gold annotation.

    Parameters:
        tokens: tokens of the sentence.
        pred_labels: predicted label id per token.
        true_labels: gold label id per token.
        filename: path of the CSV file; missing parent directories are created.

    The three lists are walked in parallel; if their lengths differ, iteration
    stops at the shortest one.
    '''
    # Make sure the target directory exists, otherwise open() would fail.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    needs_header = (not os.path.isfile(filename)) or os.path.getsize(filename) == 0

    rows = [
        [token, pred, gold]
        for token, pred, gold in zip(tokens, pred_labels, true_labels)
        if pred != 0 or gold != 0
    ]

    # Append mode: results of successive sentences accumulate in the same file.
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['token', 'pred', 'true'])
        writer.writerows(rows)

In [None]:
# Take predicted labels and for each token save the label in a list to be used for voting
def store_predicted_labels(pred_labels : list, votes : list) -> None:
    """Append the i-th predicted label to the i-th ballot list in ``votes`` (mutated in place)."""
    for position, label in enumerate(pred_labels):
        votes[position].append(label)

In [None]:
# Sample 300 random elements from the test set
sampled_test_data = random.sample(list(test_data), 300)

# Print the first few samples to verify
for i, sample in enumerate(sampled_test_data[:5]):  # Display the first 5 samples
    print(f"Sample {i + 1}:")
    print(sample)
    print("\n")

### Vanilla Method

In [None]:
from google import genai
import time

# The model name and the API client are the same for every sentence: build them once.
model = "gemma-3-27b-it"
#model = 'gemini-2.5-flash-preview-04-17'
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

for j in range(len(sampled_test_data)):
    # Extract tokens and true labels
    tokens = sampled_test_data[j]['tokens']
    true_labels = sampled_test_data[j]['ner_tags']

    # Single definition of the prompt, shared by the first attempt and the retry.
    prompt = f"""Given the following NER tags: {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}, determine the Named Entity Recognition (NER) tags for the following sentence.
            The sentence is: '{tokens}'
            This sentence contains exactly {len(tokens)} tokens.

            Print only the number associated with the NER tag for each of the {len(tokens)} tokens, using the tag-to-number mapping provided above.
            Your answer MUST follow the format: ner_tags: 0, 1, 2, 0, 0, 0
            The number of output NER tags MUST be exactly {len(tokens)}, one for each token in the order they appear in the sentence.
            Do not include any other text or explanations.
            """

    # Send the request: one attempt plus one retry after a 5 second pause.
    # A failure of the retry is not caught and stops the run.
    for attempt in range(2):
        try:
            response = client.models.generate_content(model=model, contents=prompt)
            time.sleep(1)  # short pause between successful requests
            break
        except Exception as e:
            if attempt == 1:
                raise
            print(f"Request for sample {j} failed ({e}); retrying in 5 seconds")
            time.sleep(5)

    # Parse the response
    data = parse_response(tokens, response.text, true_labels)

    # NOTE: the CSV is opened in append mode, so re-running this cell adds the
    # same sentences to the file again; delete the file before a fresh run.
    save_to_csv_vanilla(tokens, [item[1] for item in data], true_labels, "data/vanilla_test_300_gemma.csv")

In [None]:
import csv
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# Tag id -> tag name, as used in the prompts and in the result CSV files.
label_mapping = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-LOC',
    6: 'I-LOC',
    7: 'B-MISC',
    8: 'I-MISC'
}

# Tag name -> tag id; derived from label_mapping so the two can never disagree.
category_to_index = {label: index for index, label in label_mapping.items()}

# Result file to score. Change this to evaluate the output of another method.
# NOTE(review): no cell in this notebook writes exactly this path (the no-BIO
# cell writes 'data/no_bio_test_2.csv') — confirm it is the intended file.
RESULTS_CSV = 'data/no_bio_test.csv'

# Step 2: Read the CSV and convert predictions and true labels
# The CSV holds no sentence boundaries (and rows where both labels are 0 are
# not stored), so the whole file is scored as one single long sequence.
true_seqs = []
pred_seqs = []
current_true = []
current_pred = []

with open(RESULTS_CSV, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        true_label = label_mapping[int(row['true'])]
        # The response parsers keep out-of-range predictions in the file;
        # score them as 'O' instead of failing with a KeyError.
        pred_label = label_mapping.get(int(row['pred']), 'O')

        current_true.append(true_label)
        current_pred.append(pred_label)

    true_seqs.append(current_true)
    pred_seqs.append(current_pred)

# Step 3: Compute metrics
print("Precision:", precision_score(true_seqs, pred_seqs))
print("Recall:", recall_score(true_seqs, pred_seqs))
print("F1 Score:", f1_score(true_seqs, pred_seqs))

# Optional detailed report
print("\nDetailed classification report:\n")
print(classification_report(true_seqs, pred_seqs))
# Precision: 0.4524647887323944
# Recall: 0.5320910973084886
# F1 Score: 0.48905803996194097

### Decomposed-QA

In [None]:
#Decomposed-QA
import google.generativeai as genai
import os
import time

def save_to_csv_qa(pred_labels, true_labels_indices, tokens, filename):
    """
    Align the Decomposed-QA answers with the gold labels and append them to a CSV.

    Parameters:
        pred_labels: dict mapping a tag name (e.g. 'B-PER') to a comma-separated
            string of the tokens the model assigned to that tag.
        true_labels_indices: gold tag id of each token, parallel to ``tokens``.
        tokens: tokens of the sentence.
        filename: CSV file to append to; a ``token,pred,true`` header is written
            when the file is new or empty, and missing parent directories are created.

    Row construction, in order:
        1. For every predicted entity that is exactly one of the sentence tokens,
           one row ``[token, predicted id, gold id]``.
        2. Otherwise one row for every sentence token that contains, or is
           contained in, the predicted entity (case-insensitive substring test).
        3. If nothing matched and the entity is longer than one character and is
           not "None", one row with gold id 'O'.
        4. Finally, every token with a non-'O' gold label that produced no row
           so far gets a row with predicted id 'O'.

    Gold labels are looked up by token text, so when a token occurs several
    times in the sentence the label of its last occurrence is used.
    Relies on the notebook-level ``label_mapping`` and ``category_to_index``.
    """
    rows = []

    # Gold tag name for each distinct token text.
    true_label_dict = {}
    for token, label_idx in zip(tokens, true_labels_indices):
        if label_idx in label_mapping:
            true_label_dict[token] = label_mapping[label_idx]
        else:
            print(f"Warning: Unknown label index {label_idx} for token '{token}', defaulting to 'O'")
            true_label_dict[token] = 'O'

    # Process each predicted label and its tokens
    for pred_label, tokens_str in pred_labels.items():
        if pred_label == 'None' or pred_label not in category_to_index:
            print(f"Warning: Skipping invalid label '{pred_label}'")
            continue
        pred_index = category_to_index[pred_label]

        for token in (t.strip() for t in tokens_str.split(',')):
            if not token:  # Skip empty tokens
                continue

            if token in tokens:
                # Direct match with a single sentence token
                true_label = true_label_dict.get(token, 'O')
                rows.append([token, pred_index, category_to_index[true_label]])
                continue

            # Possibly a multi-token entity: fall back to substring matching
            # against every sentence token.
            token_lower = token.lower()
            matched_any = False
            for orig_token in tokens:
                orig_lower = orig_token.lower()
                if orig_lower in token_lower or token_lower in orig_lower:
                    true_label = true_label_dict.get(orig_token, 'O')
                    rows.append([orig_token, pred_index, category_to_index[true_label]])
                    matched_any = True

            # Nothing in the sentence resembles the predicted entity: keep it as
            # a spurious prediction unless it is obviously noise.
            if not matched_any and token != "None" and len(token) > 1:
                print(f"Warning: Could not match predicted token '{token}' to original tokens")
                rows.append([token, pred_index, category_to_index['O']])

    # Gold entities the model never mentioned count as predicted 'O'.
    # A set replaces the repeated linear scan over all rows.
    covered_tokens = {row[0] for row in rows}
    for token, true_label in true_label_dict.items():
        if true_label != 'O' and token not in covered_tokens:
            rows.append([token, category_to_index['O'], category_to_index[true_label]])

    # Write to CSV (append mode), creating the directory and header when needed
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    needs_header = (not os.path.isfile(filename)) or os.path.getsize(filename) == 0
    with open(filename, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(['token', 'pred', 'true'])
        writer.writerows(rows)

# Function to parse model responses and extract predicted labels
def parse_model_response(response_text):
    """
    Parse one Decomposed-QA answer into ``{tag name: "entity1, entity2"}``.

    The prompt asks for answers such as ``'B-PER: a, b' & 'I-PER: c'`` or
    ``None``. Groups are separated by ``&``; inside a group the tag name
    precedes the first colon and the comma-separated entities follow it.
    Quotes the model copies from the format example are tolerated: they are
    removed around the whole group, around the tag name and around individual
    entities. Entities are re-joined with ", ".

    Groups with an unknown tag name (not in the notebook-level
    ``category_to_index``), without a colon, or without entities are skipped
    with a warning where appropriate. A missing (non-string) response gives ``{}``.

    Returns:
        dict mapping each recognised tag name to its entity string.
    """
    quote_chars = "'\"`"

    def unwrap(text):
        # Strip whitespace and any matching pair(s) of quotes wrapping the text.
        text = text.strip()
        while len(text) >= 2 and text[0] in quote_chars and text[-1] == text[0]:
            text = text[1:-1].strip()
        return text

    results = {}

    if not isinstance(response_text, str):
        return results

    # Split by & to get multiple label groups
    for answer in response_text.split('&'):
        answer = answer.strip()

        # Skip empty answers or explicit "None" answers (quoted or not)
        if not answer or unwrap(answer) == 'None':
            continue

        if ':' not in answer:
            print(f"Warning: Invalid format in answer: '{answer}'")
            continue

        raw_label, raw_entities = answer.split(':', 1)  # Split on first colon only
        raw_label = raw_label.strip()
        label = raw_label.strip(quote_chars).strip()

        # Validate the label before adding
        if label not in category_to_index:
            print(f"Warning: Skipping invalid label format: '{label}'")
            continue

        entities = raw_entities.strip()

        # A quote opened before the tag name and not closed right after it
        # (as in 'B-PER: a, b') is closed at the very end of the group: drop it.
        opening = raw_label[:1]
        if (opening and opening in quote_chars
                and not raw_label.endswith(opening)
                and entities.endswith(opening)):
            entities = entities[:-1].rstrip()

        entity_list = [unwrap(entity) for entity in entities.split(',')]
        entity_list = [entity for entity in entity_list if entity]

        # Only add if we have non-empty entities
        if entity_list:
            results[label] = ", ".join(entity_list)

    return results


# Tag ids queried together: (B-PER, I-PER), (B-ORG, I-ORG), (B-LOC, I-LOC), (B-MISC, I-MISC)
label_groups = [(1, 2), (3, 4), (5, 6), (7, 8)]

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel('gemini-2.5-flash-preview-04-17')  # or gemma-3-27b-it
#model = genai.GenerativeModel('gemini-2.0-flash-lite')


def _send_with_retry(chat, message, pause=5, retry_pause=30):
    """Send ``message`` on ``chat``; on failure wait ``retry_pause`` seconds and try once more.

    After a successful first attempt the call sleeps ``pause`` seconds to stay
    under the rate limit. An exception raised by the retry is not caught.
    Returns the response object of the successful call.
    """
    try:
        response = chat.send_message(message)
    except Exception as e:
        print(f"Request failed ({e}); retrying in {retry_pause} seconds")
        time.sleep(retry_pause)
        return chat.send_message(message)
    time.sleep(pause)
    return response


for j in range(len(sampled_test_data)):
    tokens = sampled_test_data[j]['tokens']
    true_labels = sampled_test_data[j]['ner_tags']
    results = {}

    # One chat per sentence: the sentence is given once, then queried per label group.
    chat = model.start_chat()
    # Send general instructions only at the beginning of the conversation
    intro_message = (
        f"""Given the following NER tags: {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}, determine the Named Entity Recognition (NER) tags for the following sentence.
        If there are multiple tokens for a single category then separate the list of tokens with commas.
        
        The sentence is: '{tokens}'.
        This sentence contains exactly {len(tokens)} tokens.
        Remember that a token can only be classified once."""
    )
    _send_with_retry(chat, intro_message)

    # Send a question for each label group in BIO format
    for group in label_groups:
        label_1 = label_mapping[group[0]]
        label_2 = label_mapping[group[1]]
        # The format example names the labels actually being asked about; it
        # used to show B-PER / I-PER for every group, contradicting the question.
        prompt = (
            f"""Which are the tokens labeled as '{label_1}' and '{label_2}' in the text? 
            If there are multiple tokens for a single category then separate the list of tokens with commas.
            The output must be in the format: '{label_1}: entity1, entity2, entity3' & '{label_2}: entity4, entity5'.
            If only one category is present, then the output should be: '{label_1}: entity1, entity2, entity3' or '{label_2}: entity4, entity5'.
            If both categories have no entities, just answer with 'None'."""
        )

        response = _send_with_retry(chat, prompt)

        # Reading or parsing the text of the answer may fail; in that case the
        # group simply contributes no predictions.
        try:
            results.update(parse_model_response(response.text))
        except Exception as e:
            print(f"Error parsing response: {e}")

    # Save result to CSV with ad-hoc function for Basic Decomposed-QA
    save_to_csv_qa(results, true_labels, tokens, "data/modified_qa_test_light_25.csv")

### Improved Prompt Method

In [None]:
#from google import genai
from google import generativeai as genai
import time

# Configure the SDK and build the model once instead of once per sentence.
#model = "gemma-3-27b-it"
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
#model = 'gemini-2.5-flash-preview-04-17'
model = genai.GenerativeModel(model_name="gemma-3-27b-it")

# NOTE(review): the output file below is named "..._temp_0" while the
# temperature used is 0.2 — confirm which one is intended.
generation_config = {
    "temperature": 0.2,
    # Optionally: "top_k": ..., "top_p": ...
}

for sentence in range(len(sampled_test_data)):
    # Extract tokens and true labels
    tokens = sampled_test_data[sentence]['tokens']
    true_labels = sampled_test_data[sentence]['ner_tags']

    prompt = f""" Task: Perform Named Entity Recognition (NER) tagging on the sentence below using the BIO tagging scheme.
        Sentence: {tokens}
        Context: You are a world-class linguist and NER expert. I am the CEO of the most influential NER research company, and I’m asking for your highest-quality tagging for this sentence. Consider each token carefully, and use deep contextual understanding. Think through token sequences internally—such as how BIO tags depend on previous tokens—but do not show your reasoning in the output.
        Instructions:
        Use the following tag-to-ID mapping:
        {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}
        Output the corresponding tag ID for each of the {len(tokens)} tokens, in the order they appear.
        Format strictly as: ner_tags: 0, 1, 2, 0, 0, 0 (a comma-separated list of integers).
        The output must contain exactly {len(tokens)} tag IDs — one for each token.
        Do not include any additional commentary, explanation, or formatting beyond the required output.
        Note: Apply internal chain-of-thought reasoning as needed to preserve the logic of the BIO tagging format, particularly for entity continuation (I-) tags. However, do not output any intermediate steps."""

    # Send request (fixed pause first to stay under the rate limit)
    time.sleep(3)
    try:
        response = model.generate_content(contents=prompt, generation_config=generation_config)
    except Exception as e:
        # Same policy as the other method cells: one retry after a short pause;
        # a second failure stops the run.
        print(f"Request for sample {sentence} failed ({e}); retrying in 5 seconds")
        time.sleep(5)
        response = model.generate_content(contents=prompt, generation_config=generation_config)

    # Parse the response
    data = parse_response(tokens, response.text, true_labels)

    save_to_csv_vanilla(tokens, [item[1] for item in data], true_labels, "data/improved_prompt_method_gemma_temp_0.csv")

### POS TAGS

In [None]:
from google import genai
import time

# The model name and the API client are the same for every sentence: build them once.
model = "gemma-3-27b-it"
#model = 'gemini-2.5-flash-preview-04-17'
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

for j in range(len(sampled_test_data)):
    # Extract tokens and true labels
    tokens = sampled_test_data[j]['tokens']
    true_labels = sampled_test_data[j]['ner_tags']

    # Single definition of the prompt. The retry previously carried its own
    # copy, which had drifted from this one (a blank line was missing).
    prompt = f"""Given the following NER tags: {{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}}, determine the Named Entity Recognition (NER) tags for the following sentence.
            The sentence is: '{tokens}'.
            Consider also this POS tags: {{'"': 0, "''": 1, '#': 2, '$': 3, '(': 4, ')': 5, ',': 6, '.': 7, ':': 8, '``': 9, 'CC': 10, 'CD': 11, 'DT': 12,
                'EX': 13, 'FW': 14, 'IN': 15, 'JJ': 16, 'JJR': 17, 'JJS': 18, 'LS': 19, 'MD': 20, 'NN': 21, 'NNP': 22, 'NNPS': 23,
                'NNS': 24, 'NN|SYM': 25, 'PDT': 26, 'POS': 27, 'PRP': 28, 'PRP$': 29, 'RB': 30, 'RBR': 31, 'RBS': 32, 'RP': 33,
                'SYM': 34, 'TO': 35, 'UH': 36, 'VB': 37, 'VBD': 38, 'VBG': 39, 'VBN': 40, 'VBP': 41, 'VBZ': 42, 'WDT': 43,
                'WP': 44, 'WP$': 45, 'WRB': 46}}.
            The POS tags for the corresponding tokens are: {sampled_test_data[j]['pos_tags']} 
            This sentence contains exactly {len(tokens)} tokens.

            Print only the number associated with the NER tag for each of the {len(tokens)} tokens, using the tag-to-number mapping provided above.
            Your answer MUST follow the format: ner_tags: 0, 1, 2, 0, 0, 0
            The number of output NER tags MUST be exactly {len(tokens)}, one for each token in the order they appear in the sentence.
            Do not include any other text or explanations.
            """

    # Send the request: one attempt plus one retry after a 5 second pause.
    # A failure of the retry is not caught and stops the run.
    for attempt in range(2):
        try:
            response = client.models.generate_content(model=model, contents=prompt)
            time.sleep(1)  # short pause between successful requests
            break
        except Exception as e:
            if attempt == 1:
                raise
            print(f"Request for sample {j} failed ({e}); retrying in 5 seconds")
            time.sleep(5)

    # Parse the response
    data = parse_response(tokens, response.text, true_labels)

    # NOTE(review): the file name says "train" but the sentences come from the
    # sampled test data — confirm the intended name.
    save_to_csv_vanilla(tokens, [item[1] for item in data], true_labels, "data/pos_tags_train.csv")

### Test NO BIO

In [None]:
import csv
import os

# Ids of the B- tags (B-PER, B-ORG, B-LOC, B-MISC); the matching I- tag is id + 1.
begin_tags = {1,3,5,7}

def parse_response_no_bio(tokens : list, response_labels : str, true_labels : list) -> list:
    '''
    Parse an answer given without BIO distinction and rebuild the BIO tags.

    The model is asked for one entity-type id per token (1, 3, 5 or 7, or 0 for
    'O'). A token whose type id equals the type of the entity the previous token
    belongs to is turned into the matching inside tag (id + 1); every other
    label is kept as returned. An entity that directly follows an entity of a
    different type therefore starts with its B- tag.

    Parsing is tolerant, so that one bad answer does not abort a long run:
    the ``ner_tags:`` prefix is located if present (otherwise the text after the
    first colon, otherwise the whole text is used); each comma-separated item
    contributes its leading integer, or 0 when it has none; the label list is
    truncated or padded with 0 to one label per token; a missing (non-string)
    response yields 0 for every token.

    Parameters:
        tokens: tokens of the sentence, in order.
        response_labels: raw text returned by the model.
        true_labels: gold label id of each token (indexed in parallel with ``tokens``).

    Returns:
        A list with one ``[token, predicted_label, true_label]`` list per token.
        Labels outside 0..8 are reported on stdout but kept unchanged.
    '''
    import re  # only needed here; kept local so the cell is self-contained

    raw_text = response_labels if isinstance(response_labels, str) else ""

    # Locate the list of labels inside the answer.
    marker = re.search(r'ner_tags\s*:', raw_text)
    if marker:
        payload = raw_text[marker.end():]
    elif ':' in raw_text:
        payload = raw_text.split(':', 1)[1]
    else:
        payload = raw_text

    # Take the leading integer of every comma-separated item (0 when absent).
    raw_labels = []
    for piece in payload.split(','):
        number = re.match(r'\s*([+-]?\d+)', piece)
        raw_labels.append(int(number.group(1)) if number else 0)

    # Force exactly one label per token: truncate extras, pad the rest with 'O'.
    n_tokens = len(tokens)
    raw_labels = raw_labels[:n_tokens] + [0] * (n_tokens - len(raw_labels))

    rows = []
    for i in range(n_tokens):
        raw_label = raw_labels[i]
        pred_label = raw_label
        if i > 0 and raw_label in begin_tags:
            previous = rows[i - 1][1]
            # Continuation only when the previous token is the B- tag or the
            # I- tag of the SAME entity type.
            if previous == raw_label or previous == raw_label + 1:
                pred_label = raw_label + 1
        if pred_label < 0 or pred_label > 8:
            # Not a valid tag id; reported for inspection but deliberately not fatal.
            print(f"Token: {tokens[i]}, Predicted Label: {pred_label}, True Label: {true_labels[i]}")
        rows.append([tokens[i], pred_label, true_labels[i]])

    return rows

In [None]:
from google import genai
import time
import os

# The model name and the API client are the same for every sentence: build them once.
model = "gemma-3-27b-it"
#model = 'gemini-2.5-flash-preview-04-17'
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

for j in range(len(sampled_test_data)):
    # Extract tokens and true labels
    tokens = sampled_test_data[j]['tokens']
    true_labels = sampled_test_data[j]['ner_tags']

    # Single definition of the prompt, shared by the first attempt and the retry.
    # NOTE(review): the format example contains the id 4, which is not part of
    # the label set given to the model (0, 1, 3, 5, 7). The text is left
    # untouched to keep the experiment unchanged — confirm whether this is intended.
    prompt = f"""Given entity label set: {{'O': 0, 'PERSON': 1, 'ORGANIZATION': 3, 'LOCATION': 5, 'MISCELLANEOUS': 7}}.
            Based on the given entity label set, please recognize the named entities in the given text.
            Text: '{tokens}'.
            Your answer MUST follow the format: ner_tags: 0, 1, 0, 4, 0, 0
            The number of output NER tags MUST be exactly {len(tokens)}, one for each token in the order they appear in the sentence.
            Do not include any other text or explanations.
            """

    # Send the request: one attempt plus one retry after a 5 second pause.
    # A failure of the retry is not caught and stops the run.
    for attempt in range(2):
        try:
            response = client.models.generate_content(model=model, contents=prompt)
            time.sleep(1)  # short pause between successful requests
            break
        except Exception as e:
            if attempt == 1:
                raise
            print(f"Request for sample {j} failed ({e}); retrying in 5 seconds")
            time.sleep(5)

    # Parse the response (type ids are converted back to BIO tags)
    data = parse_response_no_bio(tokens, response.text, true_labels)

    save_to_csv_vanilla(tokens, [item[1] for item in data], true_labels, "data/no_bio_test_2.csv")