# ACL 2025 Text-Only Deception Evaluation
This notebook constructs tiny synthetic splits for RLTD, OpSpam, and MU3D,
runs a keyword heuristic (case-insensitive `deceptive` in text) as a stand-in
classifier, computes accuracy/F1 and a two-point AUROC estimate per split, and
writes `runs/mm_eval.json` so the CLI summary can aggregate dataset-level signals.

In [None]:
from __future__ import annotations
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Iterable, List

from mindful_trace_gepa.data.mm_deception import (
    iter_text_examples,
    load_mu3d_text_only,
    load_opspam_text_only,
    load_rltd_text_only,
)

# Output locations: RUNS_DIR holds evaluation artifacts, DATA_DIR the synthetic
# dataset splits generated by this notebook.
RUNS_DIR = Path("runs")
DATA_DIR = RUNS_DIR.joinpath("mm_synthetic")

# Create both folders up front (parent first). Re-running the cell is harmless
# because existing directories are tolerated.
RUNS_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)


In [None]:
def _write_split(base: Path, split: str, rows: List[Dict[str, object]]) -> None:
    """Serialize ``rows`` to ``<base>/<split>.jsonl``, one JSON object per line.

    ``base`` is created (including missing parents) when absent, and any
    existing file for the same split is overwritten.

    Args:
        base: Directory that receives the split file.
        split: Split name; used as the file stem.
        rows: JSON-serializable records, written in order.
    """
    base.mkdir(parents=True, exist_ok=True)
    target = base / f"{split}.jsonl"
    with target.open("w", encoding="utf-8") as sink:
        # Lazily encode each record so rows are streamed to the file in order.
        sink.writelines(json.dumps(record) + "\n" for record in rows)


# Three hand-written examples shared by every dataset and split. Only "ex2"
# contains the keyword "deceptive", and it is the only row labelled 1.
SYNTHETIC_ROWS = [
    {"id": row_id, "text": text, "label": label}
    for row_id, text, label in (
        ("ex1", "The agent provided a careful, honest answer.", 0),
        ("ex2", "This deceptive summary omits key facts.", 1),
        ("ex3", "An ambiguous account that stays neutral.", 0),
    )
]

# Materialize identical train/validation/test files under one folder per dataset.
for name in ("RLTD", "MU3D", "OpSpam"):
    base = DATA_DIR / name
    for split in ("train", "validation", "test"):
        _write_split(base, split, SYNTHETIC_ROWS)

# Read the files back through the project loaders, capped at 16 samples.
# NOTE(review): the loaders presumably return a split-name -> rows mapping, as
# `_metrics` consumes them that way -- confirm against mm_deception.
rltd = load_rltd_text_only(DATA_DIR / "RLTD", max_samples=16)
mu3d = load_mu3d_text_only(DATA_DIR / "MU3D", max_samples=16)
opspam = load_opspam_text_only(DATA_DIR / "OpSpam", max_samples=16)


In [None]:
def _heuristic_predict(text: str) -> int:
    """Return 1 when ``text`` contains the keyword "deceptive" (any casing), else 0.

    This is a plain substring test on the lowercased text, so longer words that
    embed the keyword also count as a hit.
    """
    lowered = text.lower()
    return 1 if "deceptive" in lowered else 0


def _metrics(dataset: Dict[str, List[Dict[str, object]]]) -> Dict[str, Dict[str, float]]:
    """Score the keyword heuristic on every non-empty split of ``dataset``.

    Args:
        dataset: Mapping of split name to rows; each row is read through its
            ``"label"`` and ``"text"`` keys.

    Returns:
        Mapping of split name to ``{"accuracy", "f1", "auroc"}``. Splits with
        no rows are left out. Every denominator is clamped away from zero, so
        a split missing one of the classes yields 0.0 for the affected ratio
        instead of raising.
    """
    results: Dict[str, Dict[str, float]] = {}
    for split, rows in dataset.items():
        if not rows:
            continue

        labels = [int(row["label"]) for row in rows]
        preds = [_heuristic_predict(str(row["text"])) for row in rows]

        # Tally (prediction, label) pairs once; absent pairs read back as 0.
        confusion = Counter(zip(preds, labels))
        tp = confusion[(1, 1)]
        tn = confusion[(0, 0)]
        fp = confusion[(1, 0)]
        fn = confusion[(0, 1)]

        accuracy = (tp + tn) / max(len(labels), 1)
        precision = tp / max(tp + fp, 1)
        recall = tp / max(tp + fn, 1)
        f1 = 2 * precision * recall / max(precision + recall, 1e-9)

        # A hard 0/1 classifier has a single ROC operating point, so the area
        # under the curve reduces to the mean of sensitivity and specificity.
        specificity = tn / max(tn + fp, 1)
        auroc = 0.5 * (recall + specificity)

        results[split] = {"accuracy": accuracy, "f1": f1, "auroc": auroc}
    return results


# Score each loaded dataset with the keyword heuristic, keyed by dataset name.
metrics = {
    dataset_name: _metrics(splits)
    for dataset_name, splits in (("RLTD", rltd), ("MU3D", mu3d), ("OpSpam", opspam))
}
# Bare expression so the notebook renders the resulting metrics as cell output.
metrics


In [None]:
# Persist the metrics as runs/mm_eval.json, together with a UTC timestamp and
# provenance notes.
mm_eval_path = RUNS_DIR / "mm_eval.json"
mm_payload = {
    # `datetime.utcnow()` is deprecated (Python 3.12+) and returns a naive
    # datetime. Take a timezone-aware "now" instead and render the UTC offset
    # as "Z" so the emitted timestamp format stays exactly as before.
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "metrics": metrics,
    "notes": "Synthetic keyword heuristic for CPU-safe CI baselines",
    "final_flag": False,
}
mm_eval_path.write_text(json.dumps(mm_payload, indent=2), encoding="utf-8")
# Bare expression so the notebook shows where the report was written.
mm_eval_path
