In [None]:
import gc
import json
import os
import random
import re
from datetime import datetime

import pandas as pd
from datasets import load_dataset
from llama_cpp import Llama
from tqdm import tqdm

# ========== Task Configurations ==========
# Benchmark registry, keyed by display name.
#   dataset / subset / split -> arguments handed to `load_dataset`
#   answer_type              -> selects how a reply is parsed ("bool", "mc", "mc4")
#   choices                  -> option letters for the multiple-choice tasks
TASKS = {
    "BoolQ": dict(
        dataset="boolq",
        split="validation",
        answer_type="bool",
    ),
    "PIQA": dict(
        dataset="piqa",
        split="validation",
        answer_type="mc",
        choices=["A", "B"],
    ),
    "OpenBookQA": dict(
        dataset="openbookqa",
        split="test",
        subset="main",
        answer_type="mc4",
        choices=["A", "B", "C", "D"],
    ),
}

# ========== Parameters ==========
# Number of examples drawn per task, and the seed used for the draw.
N = 1
SEED = 42
random.seed(SEED)

# Every *.gguf file directly inside `models_dir` is evaluated.
# os.listdir() yields entries in arbitrary order, so the list is sorted to
# give a stable, reproducible evaluation order across runs and machines.
models_dir = "./models"
model_files = sorted(f for f in os.listdir(models_dir) if f.endswith(".gguf"))

# One entry per evaluated model: {"model": <file name>, "tasks": {...}}.
results = []

# ========== Text Normalization ==========
def normalize(text):
    """Return *text* with surrounding whitespace removed and lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

# ========== Evaluate Each Model ==========
def _load_task_samples(config, n, seed):
    """Load one task's dataset split and draw up to *n* rows from it.

    A dedicated ``random.Random(seed)`` is used instead of the global RNG so
    the draw does not depend on how many draws happened before it. The draw
    is capped at the dataset size so an oversized *n* does not raise.
    """
    if "subset" in config:
        dataset = load_dataset(config["dataset"], config["subset"], split=config["split"])
    else:
        dataset = load_dataset(config["dataset"], split=config["split"])

    rng = random.Random(seed)
    size = len(dataset)
    # Sample indices rather than materialising the whole split as a list.
    picked = rng.sample(range(size), min(n, size))
    return [dataset[i] for i in picked]


def _build_prompt(config, sample):
    """Return ``(prompt, gold)`` for one dataset row.

    ``gold`` is the lower-cased expected answer ("true"/"false" or an option
    letter). Returns ``None`` when the task configuration is not one of the
    supported kinds.
    """
    if config["answer_type"] == "bool":
        prompt = (
            f"Question: {sample['question']}\n"
            "\n"
            f"Passage: {sample['passage']}\n"
            "\n"
            "Please answer TRUE or FALSE.\n"
            "Answer:"
        )
        gold = "true" if sample["answer"] else "false"
        return prompt, gold

    if config["dataset"] == "piqa":
        prompt = (
            "Choose the best option for the physical question below.\n"
            "\n"
            f"Question: {sample['goal']}\n"
            "\n"
            f"A: {sample['sol1']}\n"
            f"B: {sample['sol2']}\n"
            "\n"
            "Which option is better? Answer with A or B.\n"
            "Answer:"
        )
        gold = "a" if sample["label"] == 0 else "b"
        return prompt, gold

    if config["dataset"] == "openbookqa":
        choices = sample["choices"]
        option_lines = "".join(
            f"{label}: {text}\n"
            for label, text in zip(choices["label"], choices["text"])
        )
        prompt = (
            "You are answering a multiple choice science question. "
            "Choose the correct option: A, B, C, or D.\n"
            "\n"
            f"Question: {sample['question_stem']}\n"
            "\n"
            f"{option_lines}"
            "\nAnswer:"
        )
        gold = sample["answerKey"].lower()
        return prompt, gold

    return None


def _parse_prediction(config, response):
    """Extract the predicted answer from a lower-cased model *response*.

    For "bool" tasks the first whole word ``true`` or ``false`` wins. For
    multiple-choice tasks the first standalone letter among the task's own
    option letters wins, so letters inside words ("the answer is b") are
    ignored. Returns ``""`` when nothing usable is found.

    NOTE: a reply that starts with the English article "a" is still read as
    option A; the prompts ask for the bare letter, so this is accepted.
    """
    answer_type = config["answer_type"]
    if answer_type == "bool":
        match = re.search(r"\b(true|false)\b", response)
    elif answer_type in ("mc", "mc4"):
        options = config.get("choices") or ["A", "B", "C", "D"]
        letters = "".join(re.escape(opt.lower()) for opt in options)
        match = re.search(rf"\b([{letters}])\b", response)
    else:
        return ""
    return match.group(1) if match else ""


# Rows sampled per task, filled on first use and reused for every model so
# that all models are scored on exactly the same examples.
task_samples = {}

for model_file in model_files:
    model_path = os.path.join(models_dir, model_file)
    print(f"\n=== Evaluating model: {model_file} ===")

    model = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=8,
        n_gpu_layers=-1
    )

    model_result = {"model": model_file, "tasks": {}}

    for task_name, config in TASKS.items():
        print(f"--- Task: {task_name} ---")

        if task_name not in task_samples:
            task_samples[task_name] = _load_task_samples(config, N, SEED)
        samples = task_samples[task_name]

        correct = 0
        evaluated = 0  # rows actually sent to the model

        for sample in tqdm(samples, desc=f"Running {task_name}"):
            built = _build_prompt(config, sample)
            if built is None:
                continue
            prompt, gold = built

            # Greedy decoding (temperature 0) keeps scores reproducible
            # between runs; generation stops at the first newline.
            response = model(
                prompt,
                max_tokens=20,
                temperature=0.0,
                stop=["\n"]
            )["choices"][0]["text"].strip().lower()

            evaluated += 1
            if _parse_prediction(config, response) == gold:
                correct += 1

        # Divide by the number of rows evaluated, not by the requested N,
        # which can be larger than the dataset or include skipped rows.
        acc = correct / evaluated if evaluated else 0.0
        model_result["tasks"][task_name] = {
            "accuracy": acc,
            "correct": correct,
            "total": evaluated
        }
        print(f"[{task_name}] Accuracy: {acc * 100:.2f}% ({correct}/{evaluated})")

    results.append(model_result)

    # Drop the model before loading the next one to keep peak memory down.
    del model
    gc.collect()

# ========== Save Results (Pivot Format) ==========
# Output location: ./eval/eval_results_<N>samples_<timestamp>.csv
save_dir = "./eval"
os.makedirs(save_dir, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"eval_results_{N}samples_{timestamp}.csv"
output_path = os.path.join(save_dir, filename)

# Flatten all results into one (Model, Task, Accuracy %) row per pair.
rows = []
for model_result in results:
    model_name = model_result["model"]
    for task, result in model_result["tasks"].items():
        rows.append({
            "Model": model_name,
            "Task": task,
            "Accuracy": round(result["accuracy"] * 100, 2)
        })

if rows:
    df = pd.DataFrame(rows)

    # Pivot to "Model × Task" format
    pivot_df = df.pivot(index="Model", columns="Task", values="Accuracy")

    # Average accuracy per model across its task columns
    pivot_df["Avg"] = pivot_df.mean(axis=1).round(2)

    # Best average first
    pivot_df = pivot_df.sort_values(by="Avg", ascending=False)

    pivot_df.to_csv(output_path)
    print(f"\n✅ Combined results saved to: {output_path}")
else:
    # With no rows the frame has no "Model"/"Task" columns and pivot() would
    # raise KeyError; report the situation instead of crashing.
    print("\nNo evaluation results to save (no models or tasks were evaluated).")



llama_model_loader: loaded meta data with 32 key-value pairs and 292 tensors from ./models/DeepSeek-R1-Distill-Llama-8B-Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Llama 8B
llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Llama
llama_model_loader: - kv   4:                         general.size_label str              = 8B
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                       llama.context_length u32              = 131072
llama_model_loader: -


=== Evaluating model: DeepSeek-R1-Distill-Llama-8B-Q4_0.gguf ===


init_tokenizer: initializing tokenizer for type 2
load: control token: 128254 '<|reserved_special_token_246|>' is not marked as EOG
load: control token: 128249 '<|reserved_special_token_241|>' is not marked as EOG
load: control token: 128246 '<|reserved_special_token_238|>' is not marked as EOG
load: control token: 128243 '<|reserved_special_token_235|>' is not marked as EOG
load: control token: 128242 '<|reserved_special_token_234|>' is not marked as EOG
load: control token: 128241 '<|reserved_special_token_233|>' is not marked as EOG
load: control token: 128240 '<|reserved_special_token_232|>' is not marked as EOG
load: control token: 128235 '<|reserved_special_token_227|>' is not marked as EOG
load: control token: 128231 '<|reserved_special_token_223|>' is not marked as EOG
load: control token: 128230 '<|reserved_special_token_222|>' is not marked as EOG
load: control token: 128228 '<|reserved_special_token_220|>' is not marked as EOG
load: control token: 128225 '<|reserved_special_

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =    1845.89 ms
llama_perf_context_print: prompt eval time =    1845.75 ms /    63 tokens (   29.30 ms per token,    34.13 tokens per second)
llama_perf_context_print:        eval time =     111.60 ms /     1 runs   (  111.60 ms per token,     8.96 tokens per second)
llama_perf_context_print:       total time =    1958.60 ms /    64 tokens
Running BoolQ: 100%|██████████| 1/1 [00:01<00:00,  1.96s/it]


[BoolQ] Accuracy: 100.00% (1/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 50 prompt tokens to eval
llama_perf_context_print:        load time =    1845.89 ms
llama_perf_context_print: prompt eval time =    1425.33 ms /    50 tokens (   28.51 ms per token,    35.08 tokens per second)
llama_perf_context_print:        eval time =     107.06 ms /     1 runs   (  107.06 ms per token,     9.34 tokens per second)
llama_perf_context_print:       total time =    1533.44 ms /    51 tokens
Running PIQA: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


[PIQA] Accuracy: 100.00% (1/1)


llama_model_loader: loaded meta data with 27 key-value pairs and 579 tensors from ./models/DeepSeek-R1-Distill-Qwen-14B-Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Qwen 14B
llama_model_loader: - kv   3:                       general.organization str              = Deepseek Ai
llama_model_loader: - kv   4:                           general.basename str              = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv   5:                         general.size_label str              = 14B
llama_model_loader: - kv   6:                          qwen2.block_count u32              = 48
llama_model_load


=== Evaluating model: DeepSeek-R1-Distill-Qwen-14B-Q8_0.gguf ===


print_info: arch             = qwen2
print_info: vocab_only       = 0
print_info: n_ctx_train      = 131072
print_info: n_embd           = 5120
print_info: n_layer          = 48
print_info: n_head           = 40
print_info: n_head_kv        = 8
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 5
print_info: n_embd_k_gqa     = 1024
print_info: n_embd_v_gqa     = 1024
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-05
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 13824
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 2
print_info: rope scaling     = linear
print_info: freq_base_train  = 1000000

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =   20226.14 ms
llama_perf_context_print: prompt eval time =   20225.83 ms /   337 tokens (   60.02 ms per token,    16.66 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   20226.60 ms /   338 tokens
Running BoolQ: 100%|██████████| 1/1 [00:20<00:00, 20.23s/it]


[BoolQ] Accuracy: 0.00% (0/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 59 prompt tokens to eval
llama_perf_context_print:        load time =   20226.14 ms
llama_perf_context_print: prompt eval time =    3734.67 ms /    59 tokens (   63.30 ms per token,    15.80 tokens per second)
llama_perf_context_print:        eval time =     352.94 ms /     1 runs   (  352.94 ms per token,     2.83 tokens per second)
llama_perf_context_print:       total time =    4088.79 ms /    60 tokens
Running PIQA: 100%|██████████| 1/1 [00:04<00:00,  4.09s/it]


[PIQA] Accuracy: 0.00% (0/1)


llama_model_loader: loaded meta data with 30 key-value pairs and 579 tensors from ./models/DeepSeek-R1-Distill-Qwen-14B-Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Qwen 14B
llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv   4:                         general.size_label str              = 14B
llama_model_loader: - kv   5:                          qwen2.block_count u32              = 48
llama_model_loader: - kv   6:                       qwen2.context_length u32              = 131072
llama_model_loader: -


=== Evaluating model: DeepSeek-R1-Distill-Qwen-14B-Q4_0.gguf ===


print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 13824
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 2
print_info: rope scaling     = linear
print_info: freq_base_train  = 1000000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn  = 131072
print_info: rope_finetuned   = unknown
print_info: ssm_d_conv       = 0
print_info: ssm_d_inner      = 0
print_info: ssm_d_state      = 0
print_info: ssm_dt_rank      = 0
print_info: ssm_dt_b_c_rms   = 0
print_info: model type       = 14B
print_info: model params     = 14.77 B
print_info: general.name     = DeepSeek R1 Distill Qwen 14B
print_info: vocab type       = BPE
print_info: n_vocab          = 152064
print_info: n_merges         = 151387
print_info: BOS token        = 151646 '<｜begin▁of▁sentence｜>'
print_info: EOS token        = 151643 '<｜end▁of▁sentence｜>'
print_info: EOT token        = 151643 

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =   10651.73 ms
llama_perf_context_print: prompt eval time =   10651.38 ms /   164 tokens (   64.95 ms per token,    15.40 tokens per second)
llama_perf_context_print:        eval time =     224.04 ms /     1 runs   (  224.04 ms per token,     4.46 tokens per second)
llama_perf_context_print:       total time =   10876.82 ms /   165 tokens
Running BoolQ: 100%|██████████| 1/1 [00:10<00:00, 10.88s/it]


[BoolQ] Accuracy: 0.00% (0/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 145 prompt tokens to eval
llama_perf_context_print:        load time =   10651.73 ms
llama_perf_context_print: prompt eval time =    7784.17 ms /   145 tokens (   53.68 ms per token,    18.63 tokens per second)
llama_perf_context_print:        eval time =     214.75 ms /     1 runs   (  214.75 ms per token,     4.66 tokens per second)
llama_perf_context_print:       total time =    8000.21 ms /   146 tokens
Running PIQA: 100%|██████████| 1/1 [00:08<00:00,  8.00s/it]


[PIQA] Accuracy: 100.00% (1/1)


llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from ./models/Meta-Llama-3-8B-Instruct.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = dl
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 128256
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                          llama.block_count u32              = 32
llama_model_loader: - kv   6:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   7:                 llama.rope.dimension_cou


=== Evaluating model: Meta-Llama-3-8B-Instruct.Q4_0.gguf ===


load: missing pre-tokenizer type, using: 'default'
load:                                             
load: ************************************        
load: GENERATION QUALITY WILL BE DEGRADED!        
load: CONSIDER REGENERATING THE MODEL             
load: ************************************        
load:                                             
init_tokenizer: initializing tokenizer for type 2
load: control token: 128255 '<|reserved_special_token_250|>' is not marked as EOG
load: control token: 128254 '<|reserved_special_token_249|>' is not marked as EOG
load: control token: 128253 '<|reserved_special_token_248|>' is not marked as EOG
load: control token: 128251 '<|reserved_special_token_246|>' is not marked as EOG
load: control token: 128246 '<|reserved_special_token_241|>' is not marked as EOG
load: control token: 128243 '<|reserved_special_token_238|>' is not marked as EOG
load: control token: 128240 '<|reserved_special_token_235|>' is not marked as EOG
load: control token

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =    3999.87 ms
llama_perf_context_print: prompt eval time =    3999.68 ms /   151 tokens (   26.49 ms per token,    37.75 tokens per second)
llama_perf_context_print:        eval time =     116.70 ms /     1 runs   (  116.70 ms per token,     8.57 tokens per second)
llama_perf_context_print:       total time =    4117.77 ms /   152 tokens
Running BoolQ: 100%|██████████| 1/1 [00:04<00:00,  4.12s/it]


[BoolQ] Accuracy: 0.00% (0/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 62 prompt tokens to eval
llama_perf_context_print:        load time =    3999.87 ms
llama_perf_context_print: prompt eval time =    2483.99 ms /    62 tokens (   40.06 ms per token,    24.96 tokens per second)
llama_perf_context_print:        eval time =    2173.17 ms /    19 runs   (  114.38 ms per token,     8.74 tokens per second)
llama_perf_context_print:       total time =    4665.45 ms /    81 tokens
Running PIQA: 100%|██████████| 1/1 [00:04<00:00,  4.67s/it]


[PIQA] Accuracy: 0.00% (0/1)


llama_model_loader: loaded meta data with 27 key-value pairs and 339 tensors from ./models/DeepSeek-R1-Distill-Qwen-7B-Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Qwen 7B
llama_model_loader: - kv   3:                       general.organization str              = Deepseek Ai
llama_model_loader: - kv   4:                           general.basename str              = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv   5:                         general.size_label str              = 7B
llama_model_loader: - kv   6:                          qwen2.block_count u32              = 28
llama_model_loader:


=== Evaluating model: DeepSeek-R1-Distill-Qwen-7B-Q8_0.gguf ===


load: token to piece cache size = 0.9310 MB
print_info: arch             = qwen2
print_info: vocab_only       = 0
print_info: n_ctx_train      = 131072
print_info: n_embd           = 3584
print_info: n_layer          = 28
print_info: n_head           = 28
print_info: n_head_kv        = 4
print_info: n_rot            = 128
print_info: n_swa            = 0
print_info: n_embd_head_k    = 128
print_info: n_embd_head_v    = 128
print_info: n_gqa            = 7
print_info: n_embd_k_gqa     = 512
print_info: n_embd_v_gqa     = 512
print_info: f_norm_eps       = 0.0e+00
print_info: f_norm_rms_eps   = 1.0e-06
print_info: f_clamp_kqv      = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale    = 0.0e+00
print_info: f_attn_scale     = 0.0e+00
print_info: n_ff             = 18944
print_info: n_expert         = 0
print_info: n_expert_used    = 0
print_info: causal attn      = 1
print_info: pooling type     = 0
print_info: rope type        = 2
print_info: rope scaling     = lin

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =    8415.10 ms
llama_perf_context_print: prompt eval time =    8414.74 ms /   254 tokens (   33.13 ms per token,    30.19 tokens per second)
llama_perf_context_print:        eval time =     188.51 ms /     1 runs   (  188.51 ms per token,     5.30 tokens per second)
llama_perf_context_print:       total time =    8604.58 ms /   255 tokens
Running BoolQ: 100%|██████████| 1/1 [00:08<00:00,  8.61s/it]


[BoolQ] Accuracy: 100.00% (1/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 161 prompt tokens to eval
llama_perf_context_print:        load time =    8415.10 ms
llama_perf_context_print: prompt eval time =    5333.37 ms /   161 tokens (   33.13 ms per token,    30.19 tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    5334.16 ms /   162 tokens
Running PIQA: 100%|██████████| 1/1 [00:05<00:00,  5.34s/it]


[PIQA] Accuracy: 0.00% (0/1)


llama_model_loader: loaded meta data with 27 key-value pairs and 339 tensors from ./models/DeepSeek-R1-Distill-Qwen-7B-Q4_0_tool_tuning.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Qwen 7B
llama_model_loader: - kv   3:                       general.organization str              = Deepseek Ai
llama_model_loader: - kv   4:                           general.basename str              = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv   5:                         general.size_label str              = 7B
llama_model_loader: - kv   6:                          qwen2.block_count u32              = 28
llama_m


=== Evaluating model: DeepSeek-R1-Distill-Qwen-7B-Q4_0_tool_tuning.gguf ===


print_info: EOG token        = 151662 '<|fim_pad|>'
print_info: EOG token        = 151663 '<|repo_name|>'
print_info: EOG token        = 151664 '<|file_sep|>'
print_info: max token length = 256
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: layer   0 assigned to device CPU
load_tensors: layer   1 assigned to device CPU
load_tensors: layer   2 assigned to device CPU
load_tensors: layer   3 assigned to device CPU
load_tensors: layer   4 assigned to device CPU
load_tensors: layer   5 assigned to device CPU
load_tensors: layer   6 assigned to device CPU
load_tensors: layer   7 assigned to device CPU
load_tensors: layer   8 assigned to device CPU
load_tensors: layer   9 assigned to device CPU
load_tensors: layer  10 assigned to device CPU
load_tensors: layer  11 assigned to device CPU
load_tensors: layer  12 assigned to device CPU
load_tensors: layer  13 assigned to device CPU
load_tensors: layer  14 assigned to device CPU
load_tensors: layer  15 a

--- Task: BoolQ ---


Running BoolQ:   0%|          | 0/1 [00:00<?, ?it/s]llama_perf_context_print:        load time =    8403.87 ms
llama_perf_context_print: prompt eval time =    8403.66 ms /   201 tokens (   41.81 ms per token,    23.92 tokens per second)
llama_perf_context_print:        eval time =    2175.45 ms /    19 runs   (  114.50 ms per token,     8.73 tokens per second)
llama_perf_context_print:       total time =   10589.21 ms /   220 tokens
Running BoolQ: 100%|██████████| 1/1 [00:10<00:00, 10.59s/it]


[BoolQ] Accuracy: 0.00% (0/1)
--- Task: PIQA ---


Running PIQA:   0%|          | 0/1 [00:00<?, ?it/s]Llama.generate: 1 prefix-match hit, remaining 87 prompt tokens to eval
llama_perf_context_print:        load time =    8403.87 ms
llama_perf_context_print: prompt eval time =    3432.02 ms /    87 tokens (   39.45 ms per token,    25.35 tokens per second)
llama_perf_context_print:        eval time =     115.09 ms /     1 runs   (  115.09 ms per token,     8.69 tokens per second)
llama_perf_context_print:       total time =    3548.28 ms /    88 tokens
Running PIQA: 100%|██████████| 1/1 [00:03<00:00,  3.55s/it]


[PIQA] Accuracy: 100.00% (1/1)

✅ Combined results saved to: ./eval/eval_results_1samples_20250331_194130.csv
