# Context-Anchored Trajectory Analysis: HaluEval Validation

**Key Insight from TruthfulQA Failure:**

TruthfulQA tests *misconceptions* - plausible-sounding false beliefs. These are semantically CLOSE to the question because they're the naive, expected response, so distance from the question embedding cannot separate them from correct answers.

**Refined Hypothesis:**

For RAG/grounded generation, measure distance from **CONTEXT**, not question:

- Fabrications: claims not entailed by context → launch far from context embedding
- Valid responses: paraphrase/summarize context → stay close to context embedding

HaluEval provides (question, knowledge/context, answer, hallucination_label) - exactly what we need.

In [None]:
#!pip install -q datasets sentence-transformers spacy numpy pandas matplotlib seaborn scipy scikit-learn
#!python -m spacy download en_core_web_sm -q

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataclasses import dataclass
from typing import Optional, List, Dict
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import spacy
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve

print("Libraries loaded.")

In [None]:
# Name of the sentence-embedding model handed to SentenceTransformer below.
MODEL_NAME = "all-mpnet-base-v2"
print(f"Loading embedding model: {MODEL_NAME}")
# spaCy pipeline; extract_claims() iterates its sentence segmentation.
nlp = spacy.load("en_core_web_sm")
# Encoder used for the context, question and claim embeddings.
encoder = SentenceTransformer(MODEL_NAME)
print(f"Embedding dimension: {encoder.get_sentence_embedding_dimension()}")

## 1. Load HaluEval Dataset

In [None]:
print("Loading HaluEval QA dataset...")
dataset = load_dataset("pminervini/HaluEval", "qa_samples", split="data")

print(f"Dataset size: {len(dataset)}")
print(f"Columns: {dataset.column_names}")

# Tally the raw label strings so the label format is visible before parsing.
halluc_values = {}
for item in dataset:
    label = str(item.get("hallucination", "MISSING"))
    if label in halluc_values:
        halluc_values[label] += 1
    else:
        halluc_values[label] = 1
print(f"\nHallucination field distribution: {halluc_values}")

# Dump the first record, truncating any field longer than 100 characters.
print("\n=== EXAMPLE ===")
ex = dataset[0]
for k, v in ex.items():
    rendered = str(v)
    if len(rendered) > 100:
        print(f"{k}: {rendered[:100]}...")
    else:
        print(f"{k}: {v}")

In [None]:
def prepare_halueval_cases(dataset, max_samples: int = 500, *, min_answer_length: int = 20) -> List[Dict]:
    """
    Convert HaluEval records into labelled test cases.

    Handles yes/no, 0/1, true/false formats for the hallucination field
    (case-insensitive, surrounding whitespace ignored).

    Args:
        dataset: Iterable of dict-like records with "question", "knowledge",
            "answer" and "hallucination" entries.
        max_samples: Stop once this many cases have been collected.
        min_answer_length: Records whose answer has fewer characters than
            this are skipped. Defaults to 20; pass 0 to keep every answer.

    Returns:
        A list of dicts with keys "id", "question", "context", "output",
        "is_valid" and "source". "id" embeds the record's position in
        ``dataset`` (not its position in the returned list).
    """
    # Hoisted so the label sets are built once rather than per record.
    hallucinated_labels = {"yes", "1", "true"}
    grounded_labels = {"no", "0", "false"}

    cases = []

    for i, item in enumerate(dataset):
        if len(cases) >= max_samples:
            break

        # Records without a context or an answer cannot be scored.
        if not item.get("knowledge") or not item.get("answer"):
            continue

        if len(item["answer"]) < min_answer_length:
            continue

        # Parse hallucination field - handles multiple formats
        halluc_val = str(item.get("hallucination", "")).lower().strip()
        is_hallucinated = halluc_val in hallucinated_labels
        is_valid_answer = halluc_val in grounded_labels

        # Unrecognised labels are dropped rather than guessed at.
        if not is_hallucinated and not is_valid_answer:
            continue

        cases.append({
            "id": f"halueval_{i}",
            "question": item["question"],
            "context": item["knowledge"],
            "output": item["answer"],
            "is_valid": is_valid_answer,
            "source": "halueval"
        })

    return cases

MAX_SAMPLES = 1000
ALL_CASES = prepare_halueval_cases(dataset, max_samples=MAX_SAMPLES)

# Class balance of the prepared cases.
n_grounded = len([c for c in ALL_CASES if c['is_valid']])
n_fabricated = len([c for c in ALL_CASES if not c['is_valid']])

print(f"\nPrepared {len(ALL_CASES)} test cases:")
print(f"  Valid (grounded): {n_grounded}")
print(f"  Hallucinated: {n_fabricated}")

In [None]:
# Split the prepared cases by label and show one sample from each side.
valid_cases, halluc_cases = [], []
for case in ALL_CASES:
    (valid_cases if case['is_valid'] else halluc_cases).append(case)

for group, header, empty_warning in (
    (valid_cases, "=== VALID (GROUNDED) EXAMPLE ===", "WARNING: No valid cases found!"),
    (halluc_cases, "\n=== HALLUCINATED EXAMPLE ===", "WARNING: No hallucinated cases found!"),
):
    if not group:
        print(empty_warning)
        continue

    print(header)
    ex = group[0]
    print(f"Question: {ex['question'][:100]}...")
    print(f"Context: {ex['context'][:150]}...")
    print(f"Answer: {ex['output'][:150]}...")

## 2. Context-Anchored Trajectory Metrics

In [None]:
@dataclass
class ContextAnchoredMetrics:
    """Geometric metrics describing how an answer's claims sit relative to
    the context and question embeddings for one test case."""

    # Distances between claim embeddings and the context embedding.
    launch_from_context: float          # first claim only
    mean_distance_from_context: float
    max_distance_from_context: float
    final_distance_from_context: float  # last claim only
    # Distances between claim embeddings and the question embedding.
    launch_from_question: float         # first claim only
    mean_distance_from_question: float
    # launch_from_context divided by launch_from_question.
    grounding_ratio: float
    # Cosine between (question - context) and (first claim - context).
    context_question_alignment: float
    # Shape of the context -> claim_1 -> ... -> claim_n path.
    path_length: float
    efficiency: float
    return_to_context_ratio: float
    mean_angular_deviation: float
    # Bookkeeping.
    num_claims: int
    context_length: int

    def to_dict(self) -> dict:
        """Return the metrics as a new dict keyed by field name.

        A shallow copy is returned instead of the live ``__dict__`` so that
        callers can add or overwrite keys (e.g. a case id) without silently
        adding attributes to, or changing fields of, this instance.
        """
        return dict(self.__dict__)

In [None]:
def extract_claims(text: str) -> List[str]:
    """Split ``text`` into sentence-level claims using the spaCy pipeline.

    Each sentence is whitespace-stripped; sentences of five characters or
    fewer after stripping are discarded.
    """
    claims = []
    for sentence in nlp(text).sents:
        stripped = sentence.text.strip()
        if len(stripped) > 5:
            claims.append(stripped)
    return claims


def compute_context_anchored_metrics(question: str, context: str, output: str) -> Optional[ContextAnchoredMetrics]:
    """Compute context-anchored trajectory metrics for one answer.

    The answer is split into claims, each claim is embedded, and the path
    context -> claim_1 -> ... -> claim_n is measured with Euclidean norms of
    embedding differences.

    Args:
        question: The question text (embedded as the question anchor).
        context: The knowledge/context text (embedded as the path origin).
        output: The answer text to analyse.

    Returns:
        A ContextAnchoredMetrics instance, or None when no claims could be
        extracted from ``output``.
    """
    claims = extract_claims(output)
    if not claims:
        return None

    # Record the real claim count before any padding so that num_claims
    # reports 1 (not 2) for single-sentence answers.
    num_claims = len(claims)

    ctx_emb = encoder.encode(context)
    q_emb = encoder.encode(question)
    claim_embeddings = [encoder.encode(c) for c in claims]

    if num_claims == 1:
        # Pad to two trajectory points by reusing the embedding instead of
        # encoding the same sentence a second time. The extra step has zero
        # length, so every distance-based metric is unaffected.
        claim_embeddings = [claim_embeddings[0], claim_embeddings[0]]

    # Step vectors between consecutive trajectory points.
    trajectory = [ctx_emb] + claim_embeddings
    momenta = [trajectory[i+1] - trajectory[i] for i in range(len(trajectory)-1)]

    launch_from_context = np.linalg.norm(momenta[0])
    distances_from_ctx = [np.linalg.norm(emb - ctx_emb) for emb in claim_embeddings]

    launch_from_question = np.linalg.norm(claim_embeddings[0] - q_emb)
    distances_from_q = [np.linalg.norm(emb - q_emb) for emb in claim_embeddings]

    # Epsilon guards against division by zero when a claim equals the question.
    grounding_ratio = launch_from_context / (launch_from_question + 1e-8)

    # Cosine between the context->question and context->first-claim directions.
    ctx_to_q = q_emb - ctx_emb
    ctx_to_claim = momenta[0]
    norm_product = np.linalg.norm(ctx_to_q) * np.linalg.norm(ctx_to_claim)
    context_question_alignment = np.dot(ctx_to_q, ctx_to_claim) / norm_product if norm_product > 1e-8 else 0.0

    # Straight-line displacement relative to the total distance travelled.
    path_length = sum(np.linalg.norm(m) for m in momenta)
    direct_distance = np.linalg.norm(trajectory[-1] - trajectory[0])
    efficiency = direct_distance / (path_length + 1e-8)
    return_to_context_ratio = distances_from_ctx[-1] / (max(distances_from_ctx) + 1e-8)

    # Turning angle between consecutive steps; zero-length steps are skipped.
    angles = []
    for i in range(len(momenta) - 1):
        norm_product = np.linalg.norm(momenta[i]) * np.linalg.norm(momenta[i+1])
        if norm_product > 1e-8:
            cos_angle = np.dot(momenta[i], momenta[i+1]) / norm_product
            # Clip so rounding error cannot push the cosine outside arccos' domain.
            angles.append(np.arccos(np.clip(cos_angle, -1, 1)))

    return ContextAnchoredMetrics(
        launch_from_context=launch_from_context,
        mean_distance_from_context=np.mean(distances_from_ctx),
        max_distance_from_context=np.max(distances_from_ctx),
        final_distance_from_context=distances_from_ctx[-1],
        launch_from_question=launch_from_question,
        mean_distance_from_question=np.mean(distances_from_q),
        grounding_ratio=grounding_ratio,
        context_question_alignment=context_question_alignment,
        path_length=path_length,
        efficiency=efficiency,
        return_to_context_ratio=return_to_context_ratio,
        mean_angular_deviation=np.mean(angles) if angles else 0.0,
        num_claims=num_claims,
        context_length=len(context)
    )

## 3. Run Analysis

In [None]:
def analyze_all_cases(cases: List[Dict]) -> pd.DataFrame:
    """Compute trajectory metrics for every case and collect them in a DataFrame.

    Args:
        cases: Dicts with "id", "question", "context", "output" and
            "is_valid" keys.

    Returns:
        One row per successfully analysed case: the metric fields plus "id"
        and "is_valid". Cases that yield no metrics or raise an exception
        are counted as failed and skipped.
    """
    results = []
    failed = 0
    first_error = None

    for case in tqdm(cases, desc="Analyzing trajectories"):
        try:
            metrics = compute_context_anchored_metrics(
                question=case["question"],
                context=case["context"],
                output=case["output"]
            )
            if metrics is None:
                failed += 1
                continue

            # Copy before adding keys so the metrics object is never mutated.
            result = dict(metrics.to_dict())
            result["id"] = case["id"]
            result["is_valid"] = case["is_valid"]
            results.append(result)
        except Exception as exc:
            # Stay best-effort per case, but let KeyboardInterrupt/SystemExit
            # propagate and keep the first cause so failures are diagnosable.
            failed += 1
            if first_error is None:
                first_error = f"{type(exc).__name__}: {exc}"

    print(f"\nProcessed {len(results)} cases, {failed} failed.")
    if first_error is not None:
        print(f"First error: {first_error}")
    return pd.DataFrame(results)

print(f"Analyzing {len(ALL_CASES)} cases...")
df = analyze_all_cases(ALL_CASES)

# Row counts per label among the successfully analysed cases.
valid_mask = df['is_valid']
n_valid_rows = len(df[valid_mask])
n_halluc_rows = len(df[~valid_mask])

print(f"\nFinal: {len(df)} total, {n_valid_rows} valid, {n_halluc_rows} hallucinated")

## 4. Statistical Analysis

In [None]:
# Metric columns compared between the valid and hallucinated groups.
METRICS = ["launch_from_context", "mean_distance_from_context", "max_distance_from_context",
           "final_distance_from_context", "launch_from_question", "mean_distance_from_question",
           "grounding_ratio", "context_question_alignment", "efficiency",
           "return_to_context_ratio", "mean_angular_deviation"]

# Hypothesised sign of (hallucinated mean - valid mean) for each metric.
EXPECTED = {
    "launch_from_context": "higher",
    "mean_distance_from_context": "higher",
    "max_distance_from_context": "higher",
    "final_distance_from_context": "higher",
    "launch_from_question": "lower",
    "mean_distance_from_question": "lower",
    "grounding_ratio": "higher",
    "context_question_alignment": "lower",
    "efficiency": "lower",
    "return_to_context_ratio": "higher",
    "mean_angular_deviation": "higher"
}

def compute_stats(df, metric):
    """Compare one metric between hallucinated and valid rows.

    Args:
        df: DataFrame with a boolean "is_valid" column and a ``metric`` column.
        metric: Name of the column to compare.

    Returns:
        A dict with the group means, their difference (hallucinated - valid),
        Cohen's d (using the root mean of the two sample variances), the
        Welch t-test p-value, and three flags:
        "significant" (bool, p < 0.05), "expected" (direction from EXPECTED
        or None) and "confirmed" (bool, or None when no direction is
        registered for the metric).
    """
    valid = df[df["is_valid"]][metric].dropna()
    halluc = df[~df["is_valid"]][metric].dropna()

    diff = halluc.mean() - valid.mean()
    pooled_std = np.sqrt((valid.std()**2 + halluc.std()**2) / 2)
    cohens_d = diff / pooled_std if pooled_std > 1e-8 else 0
    _, p_value = stats.ttest_ind(halluc, valid, equal_var=False)

    # Cast to plain bool: NumPy comparison results are np.bool_ objects, for
    # which identity checks such as `x is False` never succeed.
    significant = bool(p_value < 0.05)

    expected = EXPECTED.get(metric)
    if expected == "higher":
        confirmed = bool(diff > 0) and significant
    elif expected == "lower":
        confirmed = bool(diff < 0) and significant
    else:
        confirmed = None

    return {"metric": metric, "valid_mean": valid.mean(), "halluc_mean": halluc.mean(),
            "diff": diff, "cohens_d": cohens_d, "p_value": p_value,
            "significant": significant, "confirmed": confirmed, "expected": expected}

In [None]:
# Per-metric comparison table: group means, difference, effect size,
# p-value, and whether the hypothesised direction was confirmed.
print("=" * 100)
print("CONTEXT-ANCHORED TRAJECTORY ANALYSIS: HaluEval")
print("=" * 100)
print(f"\nSamples: {len(df)} ({len(df[df['is_valid']])} valid, {len(df[~df['is_valid']])} hallucinated)")
print(f"\nKey Hypothesis: Fabricated claims launch FURTHER from context than grounded claims.\n")

stats_results = [compute_stats(df, m) for m in METRICS if m in df.columns]

print("-" * 100)
print(f"{'Metric':<35} | {'Valid':>8} | {'Halluc':>8} | {'Diff':>8} | {'d':>7} | {'p-value':>10} | Hyp")
print("-" * 100)

for r in stats_results:
    # Test for None first and then use truthiness. An `is False` identity
    # check fails for NumPy bools (the result of comparing NumPy scalars),
    # which would mislabel refuted hypotheses as "?".
    if r["confirmed"] is None:
        hyp = "?"
    elif r["confirmed"]:
        hyp = "✓"
    else:
        hyp = "✗"
    sig = "*" if r["significant"] else ""
    print(f"{r['metric']:<35} | {r['valid_mean']:>8.4f} | {r['halluc_mean']:>8.4f} | "
          f"{r['diff']:>+8.4f} | {r['cohens_d']:>+7.3f} | {r['p_value']:>10.2e}{sig} | {hyp}")

print("-" * 100)
print(f"\nConfirmed: {sum(1 for r in stats_results if r['confirmed'])}/{len(stats_results)}")
print(f"Significant: {sum(1 for r in stats_results if r['significant'])}/{len(stats_results)}")

In [None]:
print("\nMETRICS RANKED BY EFFECT SIZE")
print("=" * 80)

# Largest absolute Cohen's d first.
ranked = sorted(stats_results, key=lambda x: abs(x["cohens_d"]), reverse=True)

for i, r in enumerate(ranked, 1):
    d = abs(r["cohens_d"])
    # Effect-size bands at |d| of 0.2 / 0.5 / 0.8.
    if d >= 0.8:
        effect = "LARGE"
    elif d >= 0.5:
        effect = "MEDIUM"
    elif d >= 0.2:
        effect = "SMALL"
    else:
        effect = "negligible"

    # Significance stars by p-value tier.
    p = r["p_value"]
    if p < 0.001:
        sig = "***"
    elif p < 0.01:
        sig = "**"
    elif p < 0.05:
        sig = "*"
    else:
        sig = ""

    print(f"{i:2}. {r['metric']:<35} | d = {r['cohens_d']:>+.3f} ({effect:<10}) | {sig}")

## 5. Classification Performance

In [None]:
def compute_risk_score(row):
    """Combine three context-anchored metrics into a single risk score.

    ``row`` must support key lookup for "launch_from_context",
    "mean_distance_from_context" and "grounding_ratio". Each value is shifted
    (and, for the two distances, rescaled), clipped to [0, 1], and the three
    terms are blended with weights 0.4 / 0.3 / 0.3.
    """
    def _unit(value):
        # Clamp a shifted/rescaled metric to the unit interval.
        return np.clip(value, 0, 1)

    launch_term = _unit((row["launch_from_context"] - 0.3) / 0.7)
    mean_term = _unit((row["mean_distance_from_context"] - 0.3) / 0.7)
    grounding_term = _unit(row["grounding_ratio"] - 0.5)

    weighted = 0.4 * launch_term
    weighted = weighted + 0.3 * mean_term
    return weighted + 0.3 * grounding_term

# Score every row with compute_risk_score (defined earlier in this cell).
df["risk_score"] = df.apply(compute_risk_score, axis=1)

# Positive class (1) is "hallucinated", i.e. the negation of is_valid.
y_true = (~df["is_valid"]).astype(int)
y_scores = df["risk_score"]

fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = roc_auc_score(y_true, y_scores)

# Choose the ROC operating point that maximises TPR - FPR.
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]

# NOTE(review): the threshold is selected on the same rows it is evaluated
# on below, so accuracy/precision/recall/F1 are in-sample figures.
y_pred = (y_scores >= optimal_threshold).astype(int)
accuracy = (y_pred == y_true).mean()

# Confusion counts for the positive (hallucinated) class; the guards below
# fall back to 0 when a denominator would be zero.
tp = ((y_pred == 1) & (y_true == 1)).sum()
fp = ((y_pred == 1) & (y_true == 0)).sum()
fn = ((y_pred == 0) & (y_true == 1)).sum()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Optimal threshold: {optimal_threshold:.4f}")
print(f"\nAccuracy: {accuracy:.1%}, Precision: {precision:.1%}, Recall: {recall:.1%}, F1: {f1:.1%}")

In [None]:
# Three-panel figure: ROC curve, risk-score distributions, and effect sizes
# of context-anchored vs question-anchored metrics.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: ROC curve with the chance diagonal and the chosen operating point.
ax = axes[0]
ax.plot(fpr, tpr, color="#3498db", linewidth=2, label=f"ROC (AUC = {roc_auc:.3f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")
ax.scatter(fpr[optimal_idx], tpr[optimal_idx], color="red", s=100, zorder=5)
ax.set_xlabel("False Positive Rate"); ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve"); ax.legend(); ax.grid(True, alpha=0.3)

# Panel 2: density-normalised risk-score histograms per label, with the
# selected threshold drawn as a vertical line.
ax = axes[1]
ax.hist(df[df["is_valid"]]["risk_score"], bins=30, alpha=0.6, label="Valid", color="#2ecc71", density=True)
ax.hist(df[~df["is_valid"]]["risk_score"], bins=30, alpha=0.6, label="Hallucinated", color="#e74c3c", density=True)
ax.axvline(optimal_threshold, color="black", linestyle="--", linewidth=2)
ax.set_xlabel("Risk Score"); ax.set_ylabel("Density")
ax.set_title("Score Distribution"); ax.legend(); ax.grid(True, alpha=0.3)

# Panel 3: Cohen's d for the launch and mean-distance metrics, anchored on
# the context vs on the question. next() raises StopIteration if a metric
# is absent from stats_results.
ax = axes[2]
ctx_effects = [next(r["cohens_d"] for r in stats_results if r["metric"] == m)
               for m in ["launch_from_context", "mean_distance_from_context"]]
q_effects = [next(r["cohens_d"] for r in stats_results if r["metric"] == m)
             for m in ["launch_from_question", "mean_distance_from_question"]]
x = np.arange(2); width = 0.35
ax.bar(x - width/2, ctx_effects, width, label="Context-anchored", color="#3498db")
ax.bar(x + width/2, q_effects, width, label="Question-anchored", color="#e67e22")
ax.set_xticks(x); ax.set_xticklabels(["Launch", "Mean Distance"])
ax.set_ylabel("Cohen's d"); ax.set_title("Effect Size Comparison")
ax.legend(); ax.axhline(0, color="black", linewidth=0.5); ax.grid(True, alpha=0.3, axis="y")

plt.suptitle(f"Classification (n={len(df)}, AUC={roc_auc:.3f})", fontsize=14)
plt.tight_layout(); plt.show()

## 6. Conclusion

In [None]:
# Final report: headline numbers for the primary metric plus a verdict.
print("=" * 80)
print("EXPERIMENT SUMMARY")
print("=" * 80)

# Stats entry for the primary metric; next() raises StopIteration if it is
# missing from stats_results.
launch_ctx = next(r for r in stats_results if r["metric"] == "launch_from_context")

print(f"""
Dataset: HaluEval QA (with context/knowledge grounding)
Samples: {len(df)} ({len(df[df['is_valid']])} valid, {len(df[~df['is_valid']])} hallucinated)

PRIMARY METRIC - launch_from_context:
  Valid: {launch_ctx['valid_mean']:.4f}
  Halluc: {launch_ctx['halluc_mean']:.4f}
  Cohen's d: {launch_ctx['cohens_d']:+.3f}
  p-value: {launch_ctx['p_value']:.2e}
  Hypothesis confirmed: {launch_ctx['confirmed']}

Classification: ROC-AUC = {roc_auc:.4f}, F1 = {f1:.1%}
""")

# Verdict tiers: confirmed needs the primary metric confirmed and AUC > 0.6;
# otherwise AUC > 0.55 counts as a weak signal; anything else is no signal.
if launch_ctx['confirmed'] and roc_auc > 0.6:
    print("ASSESSMENT: HYPOTHESIS CONFIRMED - Context-anchored analysis detects fabrications.")
elif roc_auc > 0.55:
    print("ASSESSMENT: WEAK SIGNAL - Some discrimination but small effect sizes.")
else:
    print("ASSESSMENT: NO SIGNAL - Geometric trajectory approach not viable for hallucination detection.")