In [1]:
import json
import csv
from pathlib import Path
from collections import defaultdict

from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder



In [None]:
# Question bank for each attack dataset; the insertion order of this mapping
# is also the order in which the datasets are evaluated and reported.
MCQ_QUESTION_FILES = {
    "DoS":   Path("DoS_mcq_qa/dos_mcq_questions.json"),
    "Fuzzy": Path("Fuzzy_mcq_qa/fuzzy_mcq_questions.json"),
    "Gear":  Path("Gear_mcq_qa/gear_mcq_questions.json"),
    "RPM":   Path("RPM_mcq_qa/rpm_mcq_questions.json"),
}

# Dataset names, taken from the mapping above so the two cannot drift apart.
MCQ_DATASET_NAMES = list(MCQ_QUESTION_FILES)

# Model whose answers are scored. Uncomment exactly one alternative to switch.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
# MODEL_ID = "Qwen/Qwen3-4B-Thinking-2507"
# MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# File-name-safe tag: the part of MODEL_ID after the last "/", with every
# "." and "-" replaced by "_".
MODEL_TAG = MODEL_ID.rsplit("/", 1)[-1].replace("-", "_").replace(".", "_")

def mcq_answer_path(dataset_name: str, model_tag: str) -> Path:
    """Return the JSONL answers file for one dataset/model pair.

    The file lives in ``<dataset_name>_mcq_qa/`` and is named
    ``<dataset_name lower-cased>_mcq_answers_<model_tag>.jsonl``.
    """
    folder = f"{dataset_name}_mcq_qa"
    filename = f"{dataset_name.lower()}_mcq_answers_{model_tag}.jsonl"
    return Path(f"{folder}/{filename}")


In [None]:
def load_mcq_questions_map(path: Path):
    """Load a questions JSON file and index it by question id.

    The file is parsed as JSON and iterated as a sequence of records.
    Records whose ``qa_id`` is absent or ``None`` are ignored; a record
    without a ``question`` key maps to the empty string. When an id occurs
    more than once, the last record wins.

    Returns ``{qa_id: question_text}``, or ``{}`` if ``path`` does not exist.
    """
    if not path.exists():
        return {}
    entries = json.loads(path.read_text(encoding="utf-8"))
    id_text_pairs = (
        (entry.get("qa_id"), entry.get("question", "")) for entry in entries
    )
    return {qa_id: text for qa_id, text in id_text_pairs if qa_id is not None}


def load_mcq_answers(path: Path):
    """Read a JSON Lines answers file into a list of parsed records.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and every other line is parsed with
    ``json.loads``. Returns ``[]`` if ``path`` does not exist.
    """
    if not path.exists():
        return []
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]


In [None]:
def compute_stats(records):
    """Group scored answers by question text and tally them.

    ``records`` is an iterable of dicts with the keys ``question``, ``pred``
    and ``gold``. The result is a ``defaultdict`` keyed by question, where
    each value holds:

      "total":   number of records seen for the question
      "correct": number of records whose pred equals gold
      "pred":    predictions, in input order
      "gold":    gold answers, in input order
      "labels":  set of every label seen as a pred or a gold
    """
    def _empty_bucket():
        return {"total": 0, "correct": 0, "pred": [], "gold": [], "labels": set()}

    stats = defaultdict(_empty_bucket)
    for rec in records:
        question, predicted, expected = rec["question"], rec["pred"], rec["gold"]
        bucket = stats[question]
        bucket["total"] += 1
        if predicted == expected:
            bucket["correct"] += 1
        bucket["pred"].append(predicted)
        bucket["gold"].append(expected)
        bucket["labels"] |= {predicted, expected}
    return stats


def f1_for_question(info):
    """Macro-averaged F1 for one per-question bucket from ``compute_stats``.

    Special cases handled without calling scikit-learn:
      * no labels recorded          -> 0.0
      * exactly one distinct label  -> 1.0 if every answer is correct, else 0.0

    Otherwise the labels are integer-encoded and ``f1_score`` is evaluated
    with ``average="macro"`` and ``zero_division=0``.
    """
    label_pool = list(info["labels"])
    n_labels = len(label_pool)
    if n_labels == 0:
        return 0.0
    if n_labels == 1:
        all_correct = info["correct"] == info["total"]
        return 1.0 if all_correct else 0.0

    encoder = LabelEncoder()
    encoder.fit(label_pool)
    gold_codes = encoder.transform(info["gold"])
    pred_codes = encoder.transform(info["pred"])
    return f1_score(gold_codes, pred_codes, average="macro", zero_division=0)


In [None]:
# Score every dataset's MCQ answers for MODEL_TAG and write one CSV containing
# per-question rows plus a TOTAL row for each dataset, followed by the same
# breakdown pooled over all datasets (the "Combined" rows).
# NOTE(review): the directory name is spelled "Performce" (presumably
# "Performance"); it is left unchanged so existing outputs keep their location.
OUTPUT_DIR = Path("LLM_MCQ_Performce")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
out_csv = OUTPUT_DIR / f"Performance_MCQ_{MODEL_TAG}.csv"

header = ["Attack", "Question", "Total", "Correct", "Accuracy", "F1"]

# Only these option letters are scored; any other prediction or ground truth
# excludes the record from the metrics (and is reported, see below).
VALID_CHOICES = {"A", "B", "C", "D"}


def _macro_f1(gold, pred):
    """Macro-averaged F1 over every label that occurs in `gold` or `pred`."""
    encoder = LabelEncoder()
    encoder.fit(gold + pred)
    return f1_score(
        encoder.transform(gold),
        encoder.transform(pred),
        average="macro",
        zero_division=0,
    )


def _collect_valid_records(dataset_name, ans_records, qmap):
    """Turn raw answer records into ``{"question", "pred", "gold"}`` dicts.

    Predictions and ground truths are stringified, stripped and upper-cased.
    Records whose prediction or ground truth is not in VALID_CHOICES are
    excluded. Records whose ``qa_id`` is missing or has no entry in ``qmap``
    are kept under an empty question text. Both situations are counted and
    reported with a warning so the resulting metrics can be interpreted.
    """
    kept = []
    n_invalid = 0
    n_unmatched = 0
    for rec in ans_records:
        pred = str(rec.get("llm_answer", "")).strip().upper()
        gold = str(rec.get("ground_truth", "")).strip().upper()
        if pred not in VALID_CHOICES or gold not in VALID_CHOICES:
            n_invalid += 1
            continue
        # .get instead of rec["qa_id"]: a record without the key is handled
        # like an unknown id rather than aborting the whole run with KeyError.
        qa_id = rec.get("qa_id")
        if qa_id in qmap:
            q_text = qmap[qa_id]
        else:
            n_unmatched += 1
            q_text = ""
        kept.append({"question": q_text, "pred": pred, "gold": gold})

    if n_invalid:
        print(
            f"[WARN] {dataset_name}: excluded {n_invalid} of {len(ans_records)} "
            f"answer records with a prediction or ground truth outside A-D."
        )
    if n_unmatched:
        print(
            f"[WARN] {dataset_name}: {n_unmatched} answer records have no matching "
            f"question id and are grouped under an empty question text."
        )
    return kept


def _write_score_rows(writer, attack_label, records):
    """Write per-question rows and one ``<attack_label>_total`` row.

    Does nothing when ``records`` is empty.
    """
    if not records:
        return

    for question, info in compute_stats(records).items():
        total = info["total"]
        correct = info["correct"]
        acc = correct / total if total else 0.0
        writer.writerow([
            attack_label,
            question,
            total,
            correct,
            f"{acc:.3f}",
            f"{f1_for_question(info):.3f}",
        ])

    pred = [r["pred"] for r in records]
    gold = [r["gold"] for r in records]
    total = len(records)
    correct = sum(int(p == g) for p, g in zip(pred, gold))
    acc = correct / total
    writer.writerow([
        f"{attack_label}_total",
        "TOTAL",
        total,
        correct,
        f"{acc:.3f}",
        f"{_macro_f1(gold, pred):.3f}",
    ])


# Valid records from every dataset, pooled for the "Combined" rows.
all_dataset_records = []

with out_csv.open("w", encoding="utf-8", newline="") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(header)

    for name in MCQ_DATASET_NAMES:
        q_path = MCQ_QUESTION_FILES[name]
        a_path = mcq_answer_path(name, MODEL_TAG)

        qmap = load_mcq_questions_map(q_path)
        ans_records = load_mcq_answers(a_path)

        if not qmap or not ans_records:
            print(f"[WARN] Skip {name}: questions or answers missing.")
            continue

        ds_records = _collect_valid_records(name, ans_records, qmap)
        if not ds_records:
            print(f"[WARN] No valid MCQ records for {name}.")
            continue

        all_dataset_records.extend(ds_records)
        _write_score_rows(writer, name, ds_records)

    # Pooled breakdown; skipped automatically when no dataset had valid records.
    _write_score_rows(writer, "Combined", all_dataset_records)


print(f"[INFO] MCQ performance saved to {out_csv}")


[INFO] MCQ performance saved to LLM_MCQ_Performce/Performance_MCQ_DeepSeek_R1_Distill_Llama_8B.csv
