In [None]:
import os
import sys
import json

sys.path.append(".")

from checking import extract, check, eval_bias_ttest


In [None]:
# Configuration: the values a reader is most likely to tune.

# Dataset root and the model/task combination under evaluation.
base_path = "dataset"
model = "Mistral"
task = "age"

# LLM identifier and batch size forwarded to the extract() and check() calls.
llm_model = "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0"
batch_size = 1

# Every pipeline file sits side by side in <base_path>/<model>/<task>/.
data_dir = os.path.join(base_path, model, task)
extract_path, check_path, eval_path, eval_save_path = (
    os.path.join(data_dir, filename)
    for filename in (
        "raw.json",     # extract(): data_path1
        "claims.json",  # extract(): data_path2 / check(): data_path1
        "labels.json",  # check(): data_path2 / eval_bias_ttest(): data_path
        "ttest.json",   # eval_bias_ttest(): save_path
    )
)

print(f"Working directory: {data_dir}")


In [None]:
# Step 1: run claim extraction over the response text output by the model.
# Claims are requested in "triplet" format; the message printed afterwards
# reports check_path as the place where they end up.
extract(
    model=llm_model,
    batch_size=batch_size,
    claim_format="triplet",
    data_path1=extract_path,
    data_path2=check_path,
)
print(f"Claims extracted in {check_path}")

In [None]:
# Step 2: label each response claim as contradicting, neutral to, or implying
# the other claims. Joint checking is enabled with joint_check_num=10.
check(
    model=llm_model,
    batch_size=batch_size,
    is_joint=True,
    joint_check_num=10,
    data_path1=check_path,
    data_path2=eval_path,
)
print(f"Label check in {eval_path}")

In [None]:
# Step 3: assess bias with a t-test over the labels written by the check step.
# The call is made with num_groups=5 for the configured task, and the message
# printed afterwards points at eval_save_path.
eval_bias_ttest(
    task=task,
    num_groups=5,
    data_path=eval_path,
    save_path=eval_save_path,
)
print(f"T-test evaluated {eval_save_path}")

In [None]:
# Preview the saved t-test results: load the JSON file and report, for the
# first few questions, the t-value, the p-value and a significance verdict.
preview_count = 3  # number of questions shown in the preview
alpha = 0.05       # p-values below this threshold are reported as significant

# JSON text is UTF-8 by specification, so do not depend on the platform's
# default locale encoding when reading it back.
with open(eval_save_path, "r", encoding="utf-8") as f:
    ttest_results = json.load(f)

for qid, stat in list(ttest_results.items())[:preview_count]:
    print(f"Question ID: {qid}")
    print(f"t-value: {stat['t-value']:.4f}, p-value: {stat['p-value']:.4f}")
    if stat["p-value"] < alpha:
        print("Significant bias detected.")
    else:
        print("No significant bias detected.")
    # The separator is printed last so that each verdict stays visually grouped
    # with its own question instead of appearing under the divider.
    print("-" * 40)