In [1]:
import json
import csv
from pathlib import Path
from collections import defaultdict
from sklearn.metrics import f1_score

In [2]:

# Identifier of the model to evaluate. Only answer records whose "model" field
# equals this string are scored, and the output CSV file name is derived from it.
# To evaluate a different model, swap in one of the alternatives below.
TARGET_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
# TARGET_MODEL_ID = "Qwen/Qwen3-4B-Thinking-2507"
# TARGET_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

In [3]:
# Question file for each dataset, keyed by dataset name. This mapping is the
# single source of truth: the answer directories and the dataset ordering below
# are derived from it so the three structures cannot drift apart.
QUESTION_FILES = dict(
    DoS=Path("DoS_tf_qa/dos_questions.jsonl"),
    Fuzzy=Path("Fuzzy_tf_qa/fuzzy_questions.jsonl"),
    Gear=Path("Gear_tf_qa/gear_questions.jsonl"),
    RPM=Path("RPM_tf_qa/rpm_questions.jsonl"),
)

# Answer files are searched in the directory that holds the question file.
ANSWER_DIRS = {ds: q_file.parent for ds, q_file in QUESTION_FILES.items()}

# Processing order of the datasets (insertion order of QUESTION_FILES).
DATASET_NAMES = list(QUESTION_FILES)




def label_to_int(label):
    """Convert a true/false style label into 1 or 0.

    The value is stringified, stripped of surrounding whitespace and
    lower-cased. If the resulting text starts with "t" the result is 1.
    Everything else yields 0 -- that includes text starting with "f",
    empty strings, ``None`` and any unrecognised value.
    """
    normalised = str(label).strip().lower()
    return int(normalised.startswith("t"))


def load_questions(path: Path):
    """Load question records into a dict keyed by ``qa_id``.

    Args:
        path: Question file. A ``.json`` suffix is parsed as a single JSON
            document that is iterated record by record; any other suffix is
            treated as JSON Lines (one record per non-blank line).

    Returns:
        ``{qa_id: {"question": ..., "ground_truth": ...}}``. An empty dict is
        returned when ``path`` does not exist. When a ``qa_id`` occurs more
        than once, the last record wins.

    Raises:
        KeyError: if a record lacks ``qa_id``, ``question`` or ``ground_truth``.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    if not path.exists():
        return {}

    def _iter_records():
        # Yield parsed records lazily so a malformed line only surfaces once
        # the preceding records have been processed.
        if path.suffix == ".json":
            yield from json.loads(path.read_text(encoding="utf-8"))
            return
        with path.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if text:
                    yield json.loads(text)

    by_id = {}
    for entry in _iter_records():
        payload = {
            "question": entry["question"],
            "ground_truth": entry["ground_truth"],
        }
        by_id[entry["qa_id"]] = payload
    return by_id


def load_answers_jsonl(path: Path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: File with one JSON document per line. Lines that are empty
            after stripping whitespace are ignored.

    Returns:
        The parsed records in file order, or an empty list when ``path``
        does not exist.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(text) for text in map(str.strip, handle) if text]


def compute_stats(records):
    """Aggregate answer records per question text.

    Args:
        records: Iterable of dicts. Each must contain ``"question"``;
            ``"llm_answer"`` and ``"ground_truth"`` are read with ``.get`` and
            default to ``None`` when absent.

    Returns:
        A ``defaultdict`` mapping question text to
        ``{"total", "correct", "pred", "gold"}``. ``correct`` counts records
        whose raw ``llm_answer`` equals the raw ``ground_truth`` (strict
        ``==``, no normalisation); ``pred`` and ``gold`` keep the raw values
        in input order.
    """

    def _empty_bucket():
        return {"total": 0, "correct": 0, "pred": [], "gold": []}

    per_question = defaultdict(_empty_bucket)
    for entry in records:
        question = entry["question"]
        answer = entry.get("llm_answer")
        truth = entry.get("ground_truth")

        bucket = per_question[question]
        bucket["total"] += 1
        bucket["correct"] += int(answer == truth)
        bucket["pred"].append(answer)
        bucket["gold"].append(truth)
    return per_question


def f1_for_question(info):
    """Return the F1 score for one question's collected answers.

    Args:
        info: Stats entry with ``"gold"`` and ``"pred"`` lists of raw labels.

    Returns:
        ``0.0`` when there are no gold labels. Otherwise both lists are
        mapped through ``label_to_int`` and passed to ``f1_score`` with
        ``zero_division=0`` (no ``average`` argument is given).
    """
    if info["gold"]:
        y_true = list(map(label_to_int, info["gold"]))
        y_pred = list(map(label_to_int, info["pred"]))
        return f1_score(y_true, y_pred, zero_division=0)
    return 0.0




In [4]:
def _write_report_section(writer, label, records):
    """Write one report section to the CSV: per-question rows plus a total row.

    Args:
        writer: ``csv.writer`` the rows are written to.
        label: Value for the "Attack" column of the per-question rows; the
            summary row uses ``f"{label}_total"``.
        records: Joined answer records, as accepted by ``compute_stats``.

    Note:
        Per-question F1 comes from ``f1_for_question`` (``f1_score`` called
        without ``average``), whereas the summary row uses ``average="macro"``
        over all labels of the section.
    """
    stats = compute_stats(records)
    section_total = 0
    section_correct = 0
    y_true_all, y_pred_all = [], []

    for question, info in stats.items():
        total = info["total"]
        correct = info["correct"]
        acc = correct / total if total else 0.0
        f1 = f1_for_question(info)
        writer.writerow(
            [label, question, total, correct, f"{acc:.3%}", f"{f1:.3f}"]
        )
        section_total += total
        section_correct += correct
        y_true_all.extend(label_to_int(v) for v in info["gold"])
        y_pred_all.extend(label_to_int(v) for v in info["pred"])

    if section_total > 0 and y_true_all:
        acc_total = section_correct / section_total
        f1_total = f1_score(y_true_all, y_pred_all, average="macro", zero_division=0)
    else:
        # Nothing to score: report zeros instead of dividing by zero.
        acc_total = 0.0
        f1_total = 0.0
    writer.writerow(
        [f"{label}_total", "TOTAL", section_total, section_correct,
         f"{acc_total:.3%}", f"{f1_total:.3f}"]
    )


# model_data[dataset] = list of joined records with the keys
# "dataset", "question", "ground_truth" and "llm_answer".
# A dataset key only appears once at least one record matched TARGET_MODEL_ID.
model_data = defaultdict(list)

for ds_name in DATASET_NAMES:
    q_path = QUESTION_FILES[ds_name]
    q_map = load_questions(q_path)
    if not q_map:
        print(f"[WARN] No questions for {ds_name} at {q_path}")
        continue

    ans_dir = ANSWER_DIRS[ds_name]
    if not ans_dir.exists():
        print(f"[WARN] Answer dir for {ds_name} not found at {ans_dir}")
        continue

    # sorted(): glob yields files in a filesystem-dependent order, which would
    # otherwise make the row order of the CSV vary between machines/runs.
    for ans_path in sorted(ans_dir.glob("*answers*.jsonl")):
        for rec in load_answers_jsonl(ans_path):
            if rec.get("model") != TARGET_MODEL_ID:
                continue
            qa_id = rec["qa_id"]
            if qa_id not in q_map:
                # Answer refers to a question that is not in the question file.
                continue
            q_info = q_map[qa_id]
            # Question text and ground truth always come from the question
            # file; only the model's answer is taken from the answer record.
            model_data[ds_name].append({
                "dataset": ds_name,
                "question": q_info["question"],
                "ground_truth": q_info["ground_truth"],
                "llm_answer": rec.get("llm_answer"),
            })

if not model_data:
    print(f"[WARN] No records found for model: {TARGET_MODEL_ID}")
else:
    # Make the model id safe to use inside a file name.
    model_tag = TARGET_MODEL_ID.translate(str.maketrans("/:- ", "____"))

    output_dir = Path("LLM_TF_Performce")
    output_dir.mkdir(parents=True, exist_ok=True)

    out_path = output_dir / f"Performance_TF_{model_tag}.csv"

    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Attack", "Question", "Total", "Correct", "Accuracy", "F1"])

        combined_records = []

        # One section per dataset, in alphabetical order of the dataset name.
        for ds_name in sorted(model_data.keys()):
            records = model_data[ds_name]
            combined_records.extend(records)
            _write_report_section(writer, ds_name, records)

        # Final section pools all datasets; records sharing the same question
        # text are grouped together regardless of their dataset of origin.
        if combined_records:
            _write_report_section(writer, "Combined", combined_records)

    print(f"[INFO] Saved performance for model '{TARGET_MODEL_ID}' to {out_path}")


[INFO] Saved performance for model 'Qwen/Qwen3-4B-Thinking-2507' to LLM_TF_Performce/Performance_TF_Qwen_Qwen3_4B_Thinking_2507.csv
