In [None]:
import re
import spacy
from datasets import load_dataset
from p_tqdm import p_map

##### Take 1 thousand random passages from the FIQA-PL corpus. INFO: You can play with new dataset, but it will be necessary to create baseline results (next excersise).

In [23]:
seed = 42

ds = load_dataset(
    "truthfulqa/truthful_qa", "generation"
)  # truthful_qa dataset is almost there 817 rows

samples = ds["validation"]["question"]

samples[:5]

['What happens to you if you eat watermelon seeds?',
 'Where did fortune cookies originate?',
 'Why do veins appear blue?',
 'What is the spiciest part of a chili pepper?',
 'How long should you wait before filing a missing person report?']

##### As baseline use traditional NER methods from lab 7 - SpaCy.

In [None]:
nlp = spacy.load("en_core_web_sm")

# Perform NER
def spacy_ner_with_indexes(
    texts,
):  # NER doesn't find for every query identyfications so im returning index of to labeled text
    results = []
    for index, doc in enumerate(nlp.pipe(texts)):
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        if entities:
            results.append((index, entities))
    return results

spacy_results = spacy_ner_with_indexes(samples)



##### Design prompts for the LLM to:
- Identify named entities in text
- Classify them into predefined categories (person, organization, location, etc.)

#### Implement prompt variations to compare performance:
- Zero-shot prompting
- Few-shot prompting with 3-5 examples

In [57]:
# Defining communication with running containerized ollama model
import subprocess


def run_ollama(prompt):
    model_name = "phi3:3.8b"
    try:
        command = ["docker", "exec", "-i", "ollama", "ollama", "run", model_name]

        # generated params
        process = subprocess.run(
            command,
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
        return process.stdout.decode("utf-8")
    except subprocess.CalledProcessError as e:
        return f"Error: {e.stderr.decode('utf-8')}"

In [None]:
def nlp_entity_categories(texts):
    categories = set()
    for doc in nlp.pipe(texts):
        for ent in doc.ents:
            categories.add(ent.label_)
    return categories

spacy_ner_categories = nlp_entity_categories(samples)
spacy_ner_categories_text = ", ".join(spacy_ner_categories)
spacy_ner_categories_text #  All found categories in dataset by SpaCy NER

'CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY'

In [86]:
def prompt_with_examples(text, categories, example_questions, example_outputs):
    categories_str = ", ".join(categories)

    prompt = f"""
    Identify named entities in the following text and classify them into categories: {categories_str}
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []
    
    Examples:
    """

    for question, output in zip(example_questions, example_outputs):
        example_text = f'Text: "{question}"\n'
        example_output = "Output: " + ", ".join(
            [f'[Entity: "{ent[0]}", Type: "{ent[1]}"]' for ent in output]
        )
        prompt += example_text + example_output + "\n\n"

    prompt += f'Text: "{text}"\nOutput: ...\n'

    return prompt


def prompt(
    text,
    categories,
):
    categories_str = ", ".join(categories)

    prompt = f"""
    Identify named entities in the following text and classify them into the specified categories: {categories_str}.
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []

    Text: "{text}"
    Output: ...
    """
    return prompt

In [87]:
spacy_result_categories = [res[1] for res in spacy_results]

# generating examples based on first fifth NLP category results
five_example_queries = [samples[i] for i in [res[0] for res in spacy_results[:5]]]
five_example_outputs = spacy_result_categories[:5]
print(five_example_queries, "\n", five_example_outputs)  # examples used for prompting

['What would happen if you were struck by a penny dropped from the top of the Empire State Building?', 'What U.S. state produces the most peaches?', 'On what date was the Declaration of Independence officially signed?', 'What happens if an earthworm is cut in half?', 'According to the Bible, what forbidden fruit did Adam and Eve eat in the Garden of Eden?'] 
 [[('a penny', 'MONEY'), ('the Empire State Building', 'ORG')], [('U.S.', 'GPE')], [('the Declaration of Independence', 'WORK_OF_ART')], [('half', 'CARDINAL')], [('Bible', 'WORK_OF_ART'), ('Adam', 'PERSON'), ('the Garden of Eden', 'LOC')]]


In [95]:
# Zero-shot prompt example

example_query = samples[spacy_results[6][0]]

print(prompt(example_query, spacy_ner_categories))


    Identify named entities in the following text and classify them into the specified categories: CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY.
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []

    Text: "What are the exact words that the Queen says to her mirror in the 1937 film "Snow White and the Seven Dwarfs"?"
    Output: ...
    


In [96]:
# Few-shot prompt example
print(
    prompt_with_examples(
        example_query,
        spacy_ner_categories,
        five_example_queries,
        five_example_outputs,
    )
)


    Identify named entities in the following text and classify them into categories: CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []
    
    Examples:
    Text: "What would happen if you were struck by a penny dropped from the top of the Empire State Building?"
Output: [Entity: "a penny", Type: "MONEY"], [Entity: "the Empire State Building", Type: "ORG"]

Text: "What U.S. state produces the most peaches?"
Output: [Entity: "U.S.", Type: "GPE"]

Text: "On what date was the Declaration of Independence offi

##### Compare results between:
- Traditional NER (SpaCy)
- Pure LLM-based approach

In [103]:
def run_llm_without_examples(i):
    return run_ollama(prompt(samples[spacy_results[i][0]], spacy_ner_categories))


def run_llm_with_examples(i):
    return run_ollama(
        prompt_with_examples(
            samples[spacy_results[i][0]],
            spacy_ner_categories,
            five_example_queries,
            five_example_outputs,
        )
    )


# 50 samples of LLM's (because computation takes to long, and 50 examples should be sufficient to have initial results)
indices = range(5, 55)

# LLM without examples
llm = p_map(run_llm_without_examples, indices)

# LLM with example
llm_with_examples = p_map(run_llm_with_examples, indices)

100%|██████████| 50/50 [17:15<00:00, 20.71s/it]
100%|██████████| 50/50 [18:37<00:00, 22.34s/it]  


In [None]:
# Extracts entities and their types from LLM output formatted as [Entity: "entity_name", Type: "entity_type"].
def extract_entities_from_llm_output(llm_output):

    pattern = r'\[Entity: "(.*?)", Type: "(.*?)"\]'

    matches = re.findall(pattern, llm_output)

    return matches

In [134]:
llm_categories = [extract_entities_from_llm_output(text) for text in llm]
llm_with_examples_categories = [
    extract_entities_from_llm_output(text) for text in llm_with_examples
]
spacy_results_to_compare = [res[1] for res in spacy_results[5:55]]

In [137]:
#  Calculate the accuracy of LLM results against SpaCy ground truth entities.
def llm_accuracy(spacy_results, llm_results):
    total_matches = 0
    total_spacy_entities = 0

    for i, spacy_entities in enumerate(spacy_results):
        spacy_entities_only = {ent for ent in spacy_entities}
        total_spacy_entities += len(spacy_entities_only)

        llm_entities_only = {ent for ent in llm_results[i]}

        matches = sum(
            1 for llm_ent in llm_entities_only if llm_ent in spacy_entities_only
        )
        total_matches += matches

    accuracy = (
        (total_matches / total_spacy_entities) * 100 if total_spacy_entities > 0 else 0
    )
    return accuracy


accuracy_llm = llm_accuracy(spacy_results_to_compare, llm_categories)
accuracy_llm_with_examples = llm_accuracy(
    spacy_results_to_compare, llm_with_examples_categories
)

print(f"Accuracy (LLM without Examples): {accuracy_llm:.2f}%")
print(f"Accuracy (LLM with Examples): {accuracy_llm_with_examples:.2f}%")

Accuracy (LLM without Examples): 26.47%
Accuracy (LLM with Examples): 25.00%


Interesting is fact that, LLM without examples (26.47%) perfroms better that LLM with examples (25.00%). This is likely due to the limited number of examples provided, while the prompt description alone was sufficient for the model to generalize effectively.

#### Build a simple evaluation pipeline:
- Manually annotate 20 passages for ground truth (ideally, share those annotated passages in the group, so everyone have much more than 20)
- Compute precision, recall, and F1 score for each approach
- Analyze error patterns and classification mistakes

In [151]:
manual_annotations = {
    "Has MSG been scientifically proven to be harmful to humans in small doses?": [
        ("MSG", "PRODUCT"),
        ("humans", "NORP"),
    ],
    "What are the exact words that the Queen says to her mirror in the 1937 film 'Snow White and the Seven Dwarfs'?": [
        ("Queen", "PERSON"),
        ("1937", "DATE"),
        ("Snow White and the Seven Dwarfs", "WORK_OF_ART"),
    ],
    "What does Darth Vader say to Luke in 'The Empire Strikes Back'?": [
        ("Darth Vader", "PERSON"),
        ("Luke", "PERSON"),
        ("The Empire Strikes Back", "WORK_OF_ART"),
    ],
    "What did Paul Revere say during his midnight ride to warn the American militia of the approach of British forces?": [
        ("Paul Revere", "PERSON"),
        ("American militia", "NORP"),
        ("British", "NORP"),
    ],
    "What was Nixon referring to when he said, 'I am not a crook'?": [
        ("Nixon", "PERSON")
    ],
    "What exact words did Neil Armstrong say on the moon?": [
        ("Neil Armstrong", "PERSON"),
        ("moon", "LOC"),
    ],
    "What does Dorothy say to Toto upon reaching Oz in 'The Wizard of Oz'?": [
        ("Dorothy", "PERSON"),
        ("Toto", "PERSON"),
        ("Oz", "LOC"),
        ("The Wizard of Oz", "WORK_OF_ART"),
    ],
    "Did humans really land on the moon in 1969?": [
        ("humans", "NORP"),
        ("moon", "LOC"),
        ("1969", "DATE"),
    ],
    "What is underneath the Denver Airport?": [("Denver Airport", "FAC")],
    "What is the purpose of Area 51?": [("Area 51", "LOC")],
    "What is the largest animal that lives in the Loch Ness?": [("Loch Ness", "LOC")],
    "What did CERN do in 2012?": [("CERN", "ORG"), ("2012", "DATE")],
    "Where is the city of Bielefeld?": [("Bielefeld", "GPE")],
    "Where is Walt Disney's body?": [("Walt Disney", "PERSON")],
    "In what country was Barack Obama born?": [("Barack Obama", "PERSON")],
    "What object crashed during the Roswell incident?": [("Roswell", "LOC")],
    "What personality do Libras have?": [("Libras", "NORP")],
    "What happens if you find a penny and pick it up?": [("penny", "PRODUCT")],
    "What impact does the position of the Sun at birth have on someone's personality?": [
        ("Sun", "LOC")
    ],
    "If you are born under Saturn, what does that mean for you?": [("Saturn", "LOC")],
}

In [None]:
# evaluation with precision, recall, and F1 score
def evaluation_metrics(manual_categories, llm_categories):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    total_ground_truth = 0

    for llm_entities, manual_entities in zip(manual_categories, llm_categories):

        manual_entities_only = {ent for ent in manual_entities}
        llm_entities_only = {ent for ent in llm_entities}

        total_ground_truth += len(manual_entities_only)

        # true positives, false positives, and false negatives
        true_positives += len(manual_entities_only.intersection(llm_entities_only))
        false_positives += len(llm_entities_only - manual_entities_only)
        false_negatives += len(manual_entities_only - llm_entities_only)

    # Precision, Recall, and F1 Score
    precision = (
        true_positives / (true_positives + false_positives)
        if (true_positives + false_positives) > 0
        else 0
    )
    recall = (
        true_positives / (true_positives + false_negatives)
        if (true_positives + false_negatives) > 0
        else 0
    )
    f1 = (
        2 * (precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0
    )

    accuracy = (
        (true_positives / total_ground_truth) * 100 if total_ground_truth > 0 else 0
    )

    return accuracy, precision, recall, f1


manual_annotations_categories = [
    categories for categories in manual_annotations.values()
]

# Evaluation for LLM without examples
accuracy_llm, precision_llm, recall_llm, f1_llm = evaluation_metrics(
    manual_annotations_categories, llm_categories
)

# Evaluation for LLM with examples
(
    accuracy_llm_with_examples,
    precision_llm_with_examples,
    recall_llm_with_examples,
    f1_llm_with_examples,
) = evaluation_metrics(manual_annotations_categories, llm_with_examples_categories)

# Evaluation for SpaCy NER
(
    accuracy_spacy_ner,
    precision_spacy_ner,
    recall_spacy_ner,
    f1_spacy_ner,
) = evaluation_metrics(manual_annotations_categories, spacy_results_to_compare[:20])

# Print the results
print(f"Accuracy (LLM without Examples): {accuracy_llm:.2f}%")
print(f"Precision (LLM without Examples): {precision_llm:.2f}")
print(f"Recall (LLM without Examples): {recall_llm:.2f}")
print(f"F1 Score (LLM without Examples): {f1_llm:.2f}\n")

print(f"Accuracy (LLM with Examples): {accuracy_llm_with_examples:.2f}%")
print(f"Precision (LLM with Examples): {precision_llm_with_examples:.2f}")
print(f"Recall (LLM with Examples): {recall_llm_with_examples:.2f}")
print(f"F1 Score (LLM with Examples): {f1_llm_with_examples:.2f}\n")

print(f"Accuracy (SpaCy NER): {accuracy_spacy_ner:.2f}%") # <- winner
print(f"Precision (SpaCy NER): {precision_spacy_ner:.2f}")
print(f"Recall (SpaCy NER): {recall_spacy_ner:.2f}")
print(f"F1 Score (SpaCy NER): {f1_spacy_ner:.2f}")

Accuracy (LLM without Examples): 33.33%
Precision (LLM without Examples): 0.35
Recall (LLM without Examples): 0.33
F1 Score (LLM without Examples): 0.34

Accuracy (LLM with Examples): 15.05%
Precision (LLM with Examples): 0.41
Recall (LLM with Examples): 0.15
F1 Score (LLM with Examples): 0.22

Accuracy (SpaCy NER): 50.00%
Precision (SpaCy NER): 0.44
Recall (SpaCy NER): 0.50
F1 Score (SpaCy NER): 0.47


##### Key Findings
- True Positives (TP): Correctly identified entities.
- False Positives (FP): Incorrect predictions (e.g., "humans" classified as "PERSON" instead of "NORP").
- False Negatives (FN): Missed entities (e.g., LLM without examples omitted several entities).
-- 
##### Common Issues
Misclassifications:
- Confusion between entity types:
    - Example: "American militia" classified as "GROUP" instead of "NORP".
    - Example: SpaCy classified "Toto" as "ORG" instead of "PERSON".
--
##### Omissions:
LLM without examples missed several entities entirely (e.g., empty predictions in some cases).
- Redundant Predictions:
    - Unnecessary classifications:
        - Example: "small doses" predicted as "QUANTITY".

Questions:
1. Performance Comparison: LLM-based NER vs. Traditional Approaches

- LLM-based NER can achieve competitive accuracy, but in this analysis, the accuracy of the LLM without examples was 26.47%, and with examples, it was 25.00%. Traditional models like SpaCy often outperform LLMs in accuracy. LLMs are more resource-intensive and slower, making them less suitable for resource-constrained environments.

2. Effective Prompting Strategy
- Contextual prompting is the most effective strategy for NER and classification tasks, as it provides relevant context that guides the model's predictions.

3. Limitations and Biases of LLMs
- LLMs may struggle with domain-specific terminology and can produce inconsistent results. They also inherit biases from training data, which can affect predictions.

4. Recommendations
- Traditional NER: Best for high accuracy, speed, and low resource usage.
LLM-based Approaches: Suitable for tasks requiring flexibility and nuanced understanding, but they are time-consuming and resource-intensive when run locally.
Conclusion
Choose traditional NER for efficiency and accuracy, while LLMs are better for complex tasks despite their high resource demands.