In [None]:
import re
import spacy
from datasets import load_dataset
from p_tqdm import p_map

##### Take 1 thousand random passages from the FIQA-PL corpus. INFO: You can play with new dataset, but it will be necessary to create baseline results (next excersise).

In [23]:
seed = 42

ds = load_dataset(
    "truthfulqa/truthful_qa", "generation"
)  # truthful_qa dataset is almost there 817 rows

samples = ds["validation"]["question"]

samples[:5]

['What happens to you if you eat watermelon seeds?',
 'Where did fortune cookies originate?',
 'Why do veins appear blue?',
 'What is the spiciest part of a chili pepper?',
 'How long should you wait before filing a missing person report?']

##### As baseline use traditional NER methods from lab 7 - SpaCy.

In [45]:
nlp = spacy.load("en_core_web_sm")


# Perform NER
def spacy_ner_with_indexes(
    texts,
):  # NER doesn't find for every query identyfications so im returning index of to labeled text
    results = []
    for index, doc in enumerate(nlp.pipe(texts)):
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        if entities:
            results.append((index, entities))
    return results


spacy_results = spacy_ner_with_indexes(samples)



##### Design prompts for the LLM to:
- Identify named entities in text
- Classify them into predefined categories (person, organization, location, etc.)

#### Implement prompt variations to compare performance:
- Zero-shot prompting
- Few-shot prompting with 3-5 examples

In [57]:
# Defining communication with running containerized ollama model
import subprocess


def run_ollama(prompt):
    model_name = "phi3:3.8b"
    try:
        command = ["docker", "exec", "-i", "ollama", "ollama", "run", model_name]

        # generated params
        process = subprocess.run(
            command,
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
        return process.stdout.decode("utf-8")
    except subprocess.CalledProcessError as e:
        return f"Error: {e.stderr.decode('utf-8')}"

In [None]:
def nlp_entity_categories(texts):
    categories = set()
    for doc in nlp.pipe(texts):
        for ent in doc.ents:
            categories.add(ent.label_)
    return categories

spacy_ner_categories = nlp_entity_categories(samples)
spacy_ner_categories_text = ", ".join(spacy_ner_categories)
spacy_ner_categories_text #  All found categories in dataset by SpaCy NER

'CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY'

In [86]:
def prompt_with_examples(text, categories, example_questions, example_outputs):
    categories_str = ", ".join(categories)

    prompt = f"""
    Identify named entities in the following text and classify them into categories: {categories_str}
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []
    
    Examples:
    """

    for question, output in zip(example_questions, example_outputs):
        example_text = f'Text: "{question}"\n'
        example_output = "Output: " + ", ".join(
            [f'[Entity: "{ent[0]}", Type: "{ent[1]}"]' for ent in output]
        )
        prompt += example_text + example_output + "\n\n"

    prompt += f'Text: "{text}"\nOutput: ...\n'

    return prompt


def prompt(
    text,
    categories,
):
    categories_str = ", ".join(categories)

    prompt = f"""
    Identify named entities in the following text and classify them into the specified categories: {categories_str}.
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []

    Text: "{text}"
    Output: ...
    """
    return prompt

In [87]:
spacy_result_categories = [res[1] for res in spacy_results]

# generating examples based on first fifth NLP category results
five_example_queries = [samples[i] for i in [res[0] for res in spacy_results[:5]]]
five_example_outputs = spacy_result_categories[:5]
print(five_example_queries, "\n", five_example_outputs)  # examples used for prompting

['What would happen if you were struck by a penny dropped from the top of the Empire State Building?', 'What U.S. state produces the most peaches?', 'On what date was the Declaration of Independence officially signed?', 'What happens if an earthworm is cut in half?', 'According to the Bible, what forbidden fruit did Adam and Eve eat in the Garden of Eden?'] 
 [[('a penny', 'MONEY'), ('the Empire State Building', 'ORG')], [('U.S.', 'GPE')], [('the Declaration of Independence', 'WORK_OF_ART')], [('half', 'CARDINAL')], [('Bible', 'WORK_OF_ART'), ('Adam', 'PERSON'), ('the Garden of Eden', 'LOC')]]


In [95]:
# Zero-shot prompt example

example_query = samples[spacy_results[6][0]]

print(prompt(example_query, spacy_ner_categories))


    Identify named entities in the following text and classify them into the specified categories: CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY.
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []

    Text: "What are the exact words that the Queen says to her mirror in the 1937 film "Snow White and the Seven Dwarfs"?"
    Output: ...
    


In [96]:
# Few-shot prompt example
print(
    prompt_with_examples(
        example_query,
        spacy_ner_categories,
        five_example_queries,
        five_example_outputs,
    )
)


    Identify named entities in the following text and classify them into categories: CARDINAL, LOC, TIME, PERCENT, ORDINAL, FAC, PERSON, WORK_OF_ART, PRODUCT, LANGUAGE, EVENT, ORG, NORP, MONEY, DATE, LAW, GPE, QUANTITY
    
    Instructions:
    - Identify each named entity in the text.
    - Classify each entity into one of the following categories: person, organization, location, and others.
    - For each entity, return its name along with its classification type.
    - The output should be in the format: [Entity: "entity_name", Type: "category"]
    - If no entities are found, output should be an empty list: []
    
    Examples:
    Text: "What would happen if you were struck by a penny dropped from the top of the Empire State Building?"
Output: [Entity: "a penny", Type: "MONEY"], [Entity: "the Empire State Building", Type: "ORG"]

Text: "What U.S. state produces the most peaches?"
Output: [Entity: "U.S.", Type: "GPE"]

Text: "On what date was the Declaration of Independence offi

##### Compare results between:
- Traditional NER (SpaCy)
- Pure LLM-based approach

In [103]:
def run_llm_without_examples(i):
    return run_ollama(prompt(samples[spacy_results[i][0]], spacy_ner_categories))


def run_llm_with_examples(i):
    return run_ollama(
        prompt_with_examples(
            samples[spacy_results[i][0]],
            spacy_ner_categories,
            five_example_queries,
            five_example_outputs,
        )
    )


# 50 samples of LLM's (because computation takes to long, and 50 examples should be sufficient to have initial results)
indices = range(5, 55)

# LLM without examples
llm = p_map(run_llm_without_examples, indices)

# LLM with example
llm_with_examples = p_map(run_llm_with_examples, indices)

100%|██████████| 50/50 [17:15<00:00, 20.71s/it]
100%|██████████| 50/50 [18:37<00:00, 22.34s/it]  


In [None]:
# Extracts entities and their types from LLM output formatted as [Entity: "entity_name", Type: "entity_type"].
def extract_entities_from_llm_output(llm_output):

    pattern = r'\[Entity: "(.*?)", Type: "(.*?)"\]'

    matches = re.findall(pattern, llm_output)

    return matches

In [134]:
llm_categories = [extract_entities_from_llm_output(text) for text in llm]
llm_with_examples_categories = [
    extract_entities_from_llm_output(text) for text in llm_with_examples
]
spacy_results_to_compare = [res[1] for res in spacy_results[5:55]]

In [137]:
#  Calculate the accuracy of LLM results against SpaCy ground truth entities.
def llm_accuracy(spacy_results, llm_results):
    total_matches = 0
    total_spacy_entities = 0

    for i, spacy_entities in enumerate(spacy_results):
        spacy_entities_only = {ent for ent in spacy_entities}
        total_spacy_entities += len(spacy_entities_only)

        llm_entities_only = {ent for ent in llm_results[i]}

        matches = sum(
            1 for llm_ent in llm_entities_only if llm_ent in spacy_entities_only
        )
        total_matches += matches

    accuracy = (
        (total_matches / total_spacy_entities) * 100 if total_spacy_entities > 0 else 0
    )
    return accuracy


accuracy_llm = llm_accuracy(spacy_results_to_compare, llm_categories)
accuracy_llm_with_examples = llm_accuracy(
    spacy_results_to_compare, llm_with_examples_categories
)

print(f"Accuracy (LLM without Examples): {accuracy_llm:.2f}%")
print(f"Accuracy (LLM with Examples): {accuracy_llm_with_examples:.2f}%")

Accuracy (LLM without Examples): 26.47%
Accuracy (LLM with Examples): 25.00%


Interesting is fact that, LLM without examples (26.47%) perfroms better that LLM with examples (25.00%). This is likely due to the limited number of examples provided, while the prompt description alone was sufficient for the model to generalize effectively.