# Factuality Tests (FactScore)

In [9]:
import json
import os

# Local
from fm_factual.atom_extractor import AtomExtractor
from fm_factual.atom_reviser import AtomReviser
from fm_factual.context_retriever import ContextRetriever
from fm_factual.fact_score import FactScore

# The Example

In [10]:
# Set up the components of the pipeline
model = "llama-3.3-70b-instruct"
# The cache location is machine-specific. Honour the FM_FACTUAL_CACHE_DIR
# environment variable when it is set and fall back to the original path
# otherwise, so runs without the variable behave exactly as before.
cache_dir = os.environ.get("FM_FACTUAL_CACHE_DIR", "/home/radu/data/cache")
prompt_version = "v2" # v1 - original, v2 - FactBench

# Context retriever configured for the "google" service with top_k=5, using
# the cache directory chosen above.
context_retriever = ContextRetriever(service_type="google", top_k=5, cache_dir=cache_dir)
# NOTE(review): only `model` is passed to the extractor and the reviser, so
# they fall back to their own default prompt versions; `prompt_version` above
# is forwarded to FactScore only -- confirm this is intended.
atom_extractor = AtomExtractor(model)
atom_reviser = AtomReviser(model)

# Create the FactScore pipeline
pipeline = FactScore(
    context_retriever=context_retriever,
    atom_extractor=atom_extractor,
    atom_reviser=atom_reviser,
    model=model,
    prompt_version=prompt_version,
    binary_output=True,
    add_topic=False
)


[SearchAPI] Loading cache ...
[AtomExtractor] Using LLM on RITS: openai/meta-llama/llama-3-3-70b-instruct
[AtomExtractor] Using prompt version: v2
[AtomReviser] Using LLM on RITS: openai/meta-llama/llama-3-3-70b-instruct
[AtomReviser] Using prompt version: v1
[FactScore] Using LLM on RITS: openai/meta-llama/llama-3-3-70b-instruct
[FactScore] Using short prompt: False
[FactScore] Prompt version: v2
[FactScore] Binary output: True


In [11]:
# Load the example problem instance (atoms and contexts are read from the
# JSON file, so the build step below is told that both are already present).
#
# Alternative example, loaded directly from a JSON file:
# pipeline.from_json(json_file="/home/radu/git/fm-factual/examples/tim_fischer_a.json")

json_file = "/home/radu/git/fm-factual/examples/askhist-langchain-doc.json"
# Read the file explicitly as UTF-8 instead of relying on the platform's
# default encoding, which can differ between machines and mis-decode
# non-ASCII text.
with open(json_file, encoding="utf-8") as f:
    data_dict = json.load(f)
pipeline.from_dict_with_contexts(data=data_dict)
pipeline.build(
    has_atoms=True,
    has_contexts=True,
    decontextualize_atoms=False
)

[FactScore] Reading the human annotated atoms ...
[FactScore] Atoms found: 20
Atom a0: The color of the cats associated with the Norse goddess Freyja is not specified in any ancient texts or archaeological findings.
Atom a1: The notion that Vikings preferred orange cats is a widespread claim
Atom a2: The claim that Vikings preferred orange cats lacks concrete historical evidence
Atom a3: Vikings kept cats as pets
Atom a4: Vikings valued cats for their hunting skills
Atom a5: Cats were used to control rodent populations on ships and in settlements
Atom a6: There is no specific mention of a preference for orange cats by the Vikings in historical records
Atom a7: The Vikings had a strong affinity for cats
Atom a8: The Norse goddess Freyja was often depicted with cats
Atom a9: The color of the cats depicted with the Norse goddess Freyja is not specified in any ancient texts or archaeological findings
Atom a10: The origin of the claim that Vikings preferred orange cats is unclear
Atom a11: 

In [12]:
# Score the loaded instance with the FactScore pipeline and report the
# returned results on a single line.
results = pipeline.score()
summary_line = f"[FactScore] Results: {results}"
print(summary_line)

[FactScore] Labeling atoms with openai/meta-llama/llama-3-3-70b-instruct ...
[FactScore] Prompts created: 20


Prediction: 100%|██████████| 20/20 [00:00<00:00, 58375.84prompts/s]

[FactScore] Predictions: {'a0': 'NS', 'a1': 'NS', 'a2': 'NS', 'a3': 'NS', 'a4': 'NS', 'a5': 'S', 'a6': 'NS', 'a7': 'NS', 'a8': 'S', 'a9': 'NS', 'a10': 'NS', 'a11': 'NS', 'a12': 'NS', 'a13': 'NS', 'a14': 'NS', 'a15': 'NS', 'a16': 'NS', 'a17': 'NS', 'a18': 'NS', 'a19': 'NS'}
[FactScore] Gold labels: {'a0': None, 'a1': None, 'a2': None, 'a3': None, 'a4': None, 'a5': None, 'a6': None, 'a7': None, 'a8': None, 'a9': None, 'a10': None, 'a11': None, 'a12': None, 'a13': None, 'a14': None, 'a15': None, 'a16': None, 'a17': None, 'a18': None, 'a19': None}
[FactScore] Predictions: {'a0': 'NS', 'a1': 'NS', 'a2': 'NS', 'a3': 'NS', 'a4': 'NS', 'a5': 'S', 'a6': 'NS', 'a7': 'NS', 'a8': 'S', 'a9': 'NS', 'a10': 'NS', 'a11': 'NS', 'a12': 'NS', 'a13': 'NS', 'a14': 'NS', 'a15': 'NS', 'a16': 'NS', 'a17': 'NS', 'a18': 'NS', 'a19': 'NS'}
[FactScore] Gold fscore: 0.0 (0/20)
[FactScore] Results: {'factuality_score': 0.1, 'num_atoms': 20, 'num_contexts': 37, 'num_true_atoms': 2, 'num_false_atoms': 18, 'num_uniform


