From 22e6b4bc96cf7dfef36fe01b203405dec8b2566a Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 07:12:11 +0000 Subject: [PATCH 1/6] new test struct --- tests/models/model_test.py | 292 ++++++++++++++---- tests/models/test_llama3_2.py | 25 +- .../tasks/mmlu/default/_default_template_yaml | 2 +- 3 files changed, 261 insertions(+), 58 deletions(-) diff --git a/tests/models/model_test.py b/tests/models/model_test.py index a6900a1c2..bf9f72036 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -23,6 +23,7 @@ # -- end do not touch from pathlib import Path # noqa: E402 +from enum import Enum # noqa: E402 from typing import Dict, List # noqa: E402 from logbar import LogBar # noqa: E402 @@ -70,13 +71,14 @@ def is_flash_attn_2_available(): # type: ignore log = LogBar.shared() +DEFAULT_FLOOR_PCT = 0.05 +DEFAULT_CEIL_PCT = 0.10 +DEFAULT_TASK_NAMES = (EVAL.LM_EVAL.ARC_CHALLENGE,) + + class ModelTest(unittest.TestCase): DEBUG = True # enable extra debug output - TASK_NAME = EVAL.LM_EVAL.ARC_CHALLENGE - # sub test can modify - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.15 # -15% - QUANT_ARC_MAX_POSITIVE_DELTA_CEIL_PERCENT = 1.0 # 200% TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = False TORCH_DTYPE = "auto" @@ -90,6 +92,7 @@ class ModelTest(unittest.TestCase): DATASET_SIZE = 256 DATASET_SORT = "asc" DELETE_QUANTIZED_MODEL = True + EVAL_TASKS = None KERNEL_QUANT = {} # kernel sets KERNEL_INFERENCE = {} # kernel sets @@ -126,6 +129,125 @@ class ModelTest(unittest.TestCase): {"prompt": "Name the largest ocean on Earth.", "keywords": ["pacific"]}, ] + def _normalize_task_identifier(self, task): + if isinstance(task, Enum): + return task.value + if task is None: + raise ValueError("Evaluation task identifier cannot be None") + return str(task) + + def _normalize_task_list(self): + task_specs = self.get_eval_tasks() + task_lookup = getattr(self, "_resolved_task_lookup", {}) + resolved_tasks = [] + if task_specs: + for normalized_name in task_specs.keys(): + original = task_lookup.get(normalized_name) + if original is None: + original = self._resolve_task_enum(normalized_name) + if isinstance(task_lookup, dict): + task_lookup[normalized_name] = original + resolved_tasks.append(original) + else: + resolved_tasks = list(DEFAULT_TASK_NAMES) + self._resolved_task_lookup = { + self._normalize_task_identifier(task): task for task in resolved_tasks + } + + normalized = [self._normalize_task_identifier(task) for task in resolved_tasks if task is not None] + if not normalized: + raise ValueError("No evaluation tasks configured") + return normalized + + def _resolve_task_enum(self, task): + if isinstance(task, Enum): + return task + if isinstance(task, str): + for enum_member in EVAL.get_task_enums(): + if task == enum_member.value or task == enum_member.name: + return enum_member + raise ValueError(f"Unknown evaluation task identifier: {task}") + + def _legacy_arc_tasks(self): + baselines = {} + arc_metrics = {} + if hasattr(self, "NATIVE_ARC_CHALLENGE_ACC"): + arc_metrics["acc"] = { + "value": self.NATIVE_ARC_CHALLENGE_ACC, + "floor_pct": DEFAULT_FLOOR_PCT, + "ceil_pct": DEFAULT_CEIL_PCT, + } + if hasattr(self, "NATIVE_ARC_CHALLENGE_ACC_NORM"): + arc_metrics["acc_norm"] = { + "value": self.NATIVE_ARC_CHALLENGE_ACC_NORM, + "floor_pct": DEFAULT_FLOOR_PCT, + "ceil_pct": DEFAULT_CEIL_PCT, + } + if arc_metrics: + normalized = self._normalize_task_identifier(EVAL.LM_EVAL.ARC_CHALLENGE) + baselines[normalized] = arc_metrics + lookup = getattr(self, "_resolved_task_lookup", None) + if 
isinstance(lookup, dict): + lookup[normalized] = EVAL.LM_EVAL.ARC_CHALLENGE + return baselines + + def _normalize_metric_spec(self, spec): + default_floor = DEFAULT_FLOOR_PCT + default_ceil = DEFAULT_CEIL_PCT + + if isinstance(spec, dict): + if "value" not in spec: + raise ValueError("Baseline metric dictionaries must include a `value` key.") + value = spec["value"] + floor_pct = spec.get("floor_pct", spec.get("max_delta_floor_percent", default_floor)) + ceil_pct = spec.get("ceil_pct", spec.get("max_delta_ceil_percent", default_ceil)) + metric_key = spec.get("metric_key") + else: + value = spec + floor_pct = default_floor + ceil_pct = default_ceil + metric_key = None + + if not isinstance(value, (int, float)): + raise TypeError(f"Baseline metric value must be numeric, got {type(value).__name__}") + if not isinstance(floor_pct, (int, float)): + raise TypeError(f"`floor_pct` must be numeric, got {type(floor_pct).__name__}") + if not isinstance(ceil_pct, (int, float)): + raise TypeError(f"`ceil_pct` must be numeric, got {type(ceil_pct).__name__}") + + return { + "value": float(value), + "floor_pct": float(floor_pct), + "ceil_pct": float(ceil_pct), + "metric_key": metric_key, + } + + def get_eval_tasks(self): + self._resolved_task_lookup = {} + if self.EVAL_TASKS: + baselines = {} + for task, metrics in self.EVAL_TASKS.items(): + resolved_task = self._resolve_task_enum(task) + normalized_task = self._normalize_task_identifier(resolved_task) + self._resolved_task_lookup[normalized_task] = resolved_task + baselines[normalized_task] = { + metric_name: self._normalize_metric_spec(spec) + for metric_name, spec in metrics.items() + } + return baselines + return self._legacy_arc_tasks() + + @staticmethod + def _flatten_task_metrics(task_results): + flat = {} + for task_name, metrics in task_results.items(): + if isinstance(metrics, dict): + for metric_name, value in metrics.items(): + flat[f"{task_name}:{metric_name}"] = value + else: + flat[task_name] = metrics + return flat + def assertInference(self, model, tokenizer=None, keywords=None, prompt=INFERENCE_PROMPT): # gptqmodel can auto init tokenizer internally @@ -219,7 +341,7 @@ def run_generic_inference_checks(self, model, tokenizer, backend): ) return results - def run_arc_challenge_eval(self, model, backend, trust_remote_code=False): + def run_eval_tasks(self, model, backend, trust_remote_code=False): previous_backend = self.LOAD_BACKEND self.LOAD_BACKEND = backend try: @@ -229,14 +351,14 @@ def run_arc_challenge_eval(self, model, backend, trust_remote_code=False): trust_remote_code=self.TRUST_REMOTE_CODE, delete_quantized_model=False, ) - log.info(f"[{backend.name}] ARC summary: {task_results}") + log.info(f"[{backend.name}] Evaluation summary: {task_results}") finally: self.LOAD_BACKEND = previous_backend return task_results def perform_post_quant_validation(self, model_path, trust_remote_code=False): inference_records = {} - arc_records = {} + eval_records = {} reuse_candidates = {} compare_backends = (BACKEND.MARLIN,) if self.FORMAT is FORMAT.GPTQ else (BACKEND.MARLIN, BACKEND.GEMM) @@ -266,7 +388,7 @@ def perform_post_quant_validation(self, model_path, trust_remote_code=False): should_reuse = can_reuse and backend == target_backend and not self.USE_VLLM try: - arc_records[backend] = self.run_arc_challenge_eval(model, backend, trust_remote_code=trust_remote_code) + eval_records[backend] = self.run_eval_tasks(model, backend, trust_remote_code=trust_remote_code) finally: if should_reuse: reuse_candidates[backend] = model @@ -275,9 +397,9 @@ 
def perform_post_quant_validation(self, model_path, trust_remote_code=False): torch_empty_cache() self.render_inference_summary(inference_records) - self.render_arc_summary(arc_records) + self.render_eval_summary(eval_records) - return reuse_candidates, arc_records + return reuse_candidates, eval_records @staticmethod def _human_size(num_bytes: int) -> str: @@ -420,27 +542,30 @@ def _format_inference_entry(self, entry): cell = f"{status} | {snippet}" if snippet else status return self._colorize(cell, matched) - def render_arc_summary(self, arc_records): - if not arc_records: + def render_eval_summary(self, eval_records): + if not eval_records: return - ordered_backends = [backend for backend in (BACKEND.MARLIN, BACKEND.TORCH) if backend in arc_records] + ordered_backends = [backend for backend in (BACKEND.MARLIN, BACKEND.TORCH) if backend in eval_records] if not ordered_backends: return - metrics = set() - for results in arc_records.values(): - metrics.update(results.keys()) - metrics = sorted(metrics) + flattened_records = { + backend: self._flatten_task_metrics(results) for backend, results in eval_records.items() + } + + metrics = sorted({metric for results in flattened_records.values() for metric in results.keys()}) table_rows = [] tolerance = 0.01 - torch_reference = arc_records.get(BACKEND.TORCH, {}) + torch_reference = flattened_records.get(BACKEND.TORCH, {}) for metric in metrics: - row = [metric] - reference_value = torch_reference.get(metric) + display_metric = metric.replace(":", " :: ") + row = [display_metric] + reference_value = None if torch_reference is None else torch_reference.get(metric) for backend in ordered_backends: - value = arc_records[backend].get(metric) + backend_values = flattened_records.get(backend, {}) + value = backend_values.get(metric) if value is None: row.append(self._colorize("N/A", False)) continue @@ -452,7 +577,7 @@ def render_arc_summary(self, arc_records): table_rows.append(row) headers = ["Metric"] + [backend.name for backend in ordered_backends] - log.info("ARC challenge comparison:\n%s", tabulate(table_rows, headers=headers, tablefmt="github")) + log.info("Evaluation comparison:\n%s", tabulate(table_rows, headers=headers, tablefmt="github")) def load_tokenizer(self, model_id_or_path, trust_remote_code=False): tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) @@ -576,7 +701,7 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, dtype="auto", ne ) tokenizer = model.tokenizer - self._post_quant_arc_records = {} + self._post_quant_eval_records = {} is_image_to_text_model = MODALITY.IMAGE_TO_TEXT in model.modality calibration_dataset = get_calib_dataset(model) if is_image_to_text_model else self.load_dataset(tokenizer, self.DATASET_SIZE) @@ -608,8 +733,8 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, dtype="auto", ne log.info(f"Quantized Model saved to tmp dir: {path}") target_backend = self.LOAD_BACKEND - reuse_candidates, arc_records = self.perform_post_quant_validation(path, trust_remote_code=trust_remote_code) - self._post_quant_arc_records = arc_records + reuse_candidates, eval_records = self.perform_post_quant_validation(path, trust_remote_code=trust_remote_code) + self._post_quant_eval_records = eval_records q_model = reuse_candidates.pop(target_backend, None) if q_model is None: @@ -695,6 +820,8 @@ def loadQuantModel(self, model_id_or_path, trust_remote_code=False, tokenizer_pa def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, 
delete_quantized_model=False, extra_args:dict=None): try: + task_names = self._normalize_task_list() + aggregated_results = {} with tempfile.TemporaryDirectory() as tmp_dir: model_path = getattr(model, "model_local_path", None) if isinstance(model, str): @@ -717,7 +844,7 @@ def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, del from lm_eval.tasks import TaskManager from lm_eval.utils import make_table - task_groups = EVAL.get_task_groups_from_tasks(self.TASK_NAME) + task_groups = EVAL.get_task_groups_from_tasks(task_names) for framework, tasks in task_groups.items(): log.info(f"TEST: EVAL starting: backend = {self.LOAD_BACKEND}") @@ -732,6 +859,17 @@ def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, del if eval_target is None: raise ValueError("Model evaluation target could not be determined.") + resolved_lookup = getattr(self, "_resolved_task_lookup", {}) + eval_tasks = [] + for task in tasks: + original_task = resolved_lookup.get(task) + if original_task is None: + original_task = self._resolve_task_enum(task) + if isinstance(resolved_lookup, dict): + normalized_task = self._normalize_task_identifier(original_task) + resolved_lookup[normalized_task] = original_task + eval_tasks.append(original_task) + results = GPTQModel.eval( model_or_id_or_path=eval_target, llm_backend="vllm" if self.USE_VLLM else "gptqmodel", @@ -739,7 +877,7 @@ def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, del output_path=tmp_dir, backend=self.LOAD_BACKEND, framework=framework, - tasks=tasks, + tasks=eval_tasks, apply_chat_template=apply_chat_template, trust_remote_code=trust_remote_code, batch_size=self.EVAL_BATCH_SIZE, @@ -753,18 +891,28 @@ def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, del if "groups" in results: print(make_table(results, "groups")) print('--------Eval Result End---------') - task_results = { - metric: value for metric, value in results['results'].get(self.TASK_NAME.value, {}).items() - if metric != 'alias' and 'stderr' not in metric - } - print(task_results) + for task_name in eval_tasks: + normalized_task_name = self._normalize_task_identifier(task_name) + metrics = results["results"].get(normalized_task_name, {}) + filtered_metrics = { + metric: value + for metric, value in metrics.items() + if metric != "alias" and "stderr" not in metric + } + aggregated_results[normalized_task_name] = filtered_metrics + print({normalized_task_name: filtered_metrics}) # only delete tmp folders - if delete_quantized_model and model.model_local_path.startswith("/tmp") and os.path.exists( - model.model_local_path): - log.info(f"Deleting temp model: {model.model_local_path}") - shutil.rmtree(model.model_local_path) - return task_results + model_local_path = getattr(model, "model_local_path", "") + if ( + delete_quantized_model + and isinstance(model_local_path, str) + and model_local_path.startswith("/tmp") + and os.path.exists(model_local_path) + ): + log.info(f"Deleting temp model: {model_local_path}") + shutil.rmtree(model_local_path) + return aggregated_results except BaseException as e: if isinstance(e, torch.OutOfMemoryError): old_batch = self.EVAL_BATCH_SIZE @@ -788,15 +936,9 @@ def lm_eval(self, model, apply_chat_template=False, trust_remote_code=False, del else: raise e - def calculatorPer(self, filter, value): - if "norm" in filter: - expected = self.NATIVE_ARC_CHALLENGE_ACC_NORM - else: - expected = self.NATIVE_ARC_CHALLENGE_ACC - + def calculatorPer(self, task_name, metric_name, value, 
expected): diff_pct = (value / expected) * 100 - log.info(f"{filter}: `{value}` vs `{expected}` diff {diff_pct:.2f}%") - + log.info(f"{task_name}:{metric_name}: `{value}` vs `{expected}` diff {diff_pct:.2f}%") return diff_pct, expected def quant_lm_eval(self): @@ -810,11 +952,11 @@ def quant_lm_eval(self): self.check_kernel(self.model, self.KERNEL_INFERENCE) - arc_records = getattr(self, "_post_quant_arc_records", {}) + eval_records = getattr(self, "_post_quant_eval_records", {}) target_backend = self.LOAD_BACKEND - if arc_records and len(arc_records) == 1 and target_backend in arc_records: - log.info("Reusing ARC results for backend `%s`; skipping duplicate lm_eval run", target_backend.name) - task_results = arc_records[target_backend] + if eval_records and len(eval_records) == 1 and target_backend in eval_records: + log.info("Reusing evaluation results for backend `%s`; skipping duplicate lm_eval run", target_backend.name) + task_results = eval_records[target_backend] else: task_results = self.lm_eval( model=self.SAVE_PATH if self.SAVE_PATH else self.model, @@ -825,11 +967,53 @@ def quant_lm_eval(self): self.check_results(task_results) def check_results(self, task_results): - for filter, value in task_results.items(): - diff_pct, expected = self.calculatorPer(filter=filter, value=value) - negative_pct = 100 * (1 - self.QUANT_ARC_MAX_DELTA_FLOOR_PERCENT) - positive_pct = 100 * (1 + self.QUANT_ARC_MAX_POSITIVE_DELTA_CEIL_PERCENT) - self.assertTrue(negative_pct <= diff_pct <= positive_pct, f"{filter}: `{value}` vs expected `{expected}`, diff {diff_pct:.2f}% is out of the expected range [{negative_pct}-{positive_pct}%]") + baselines = self.get_eval_tasks() + if not baselines: + raise AssertionError("No evaluation baselines configured for result validation.") + + for task_name, expected_metrics in baselines.items(): + metrics = task_results.get(task_name) + if metrics is None: + self.fail(f"No evaluation results returned for task `{task_name}`") + if not isinstance(metrics, dict): + raise TypeError(f"Expected metrics for task `{task_name}` to be a dictionary, got {type(metrics).__name__}") + + for metric_name, baseline_spec in expected_metrics.items(): + metric_key = baseline_spec.get("metric_key") or metric_name + metric_key = self._resolve_metric_key(metric_key, metrics) + if metric_key is None: + self.fail(f"Metric `{metric_name}` missing from results for task `{task_name}`") + + value = metrics[metric_key] + expected_value = baseline_spec["value"] + diff_pct, expected_value = self.calculatorPer( + task_name=task_name, + metric_name=metric_name, + value=value, + expected=expected_value, + ) + floor_pct = baseline_spec["floor_pct"] + ceil_pct = baseline_spec["ceil_pct"] + negative_pct = 100 * (1 - floor_pct) + positive_pct = 100 * (1 + ceil_pct) + self.assertTrue( + negative_pct <= diff_pct <= positive_pct, + f"{task_name}:{metric_name}: `{value}` vs expected `{expected_value}`, " + f"diff {diff_pct:.2f}% is out of the expected range [{negative_pct}-{positive_pct}%]", + ) + + @staticmethod + def _resolve_metric_key(metric_name, metrics): + if metric_name in metrics: + return metric_name + if metric_name is None: + return None + # if baseline uses canonical name without suffix, look for variants like acc,none + prefix = f"{metric_name}," + for key in metrics.keys(): + if key.startswith(prefix): + return key + return None def check_lm_head_loss(self, quant_log: List[Dict[str, any]]): final_log = quant_log[-1] diff --git a/tests/models/test_llama3_2.py b/tests/models/test_llama3_2.py index 
195638167..3686ad9b6 100644 --- a/tests/models/test_llama3_2.py +++ b/tests/models/test_llama3_2.py @@ -4,6 +4,7 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL # a100:7, MARLIN kernel @@ -14,9 +15,27 @@ # desc_act = False, act_group_aware = True 0.3217/0.3643 class TestLlama3_2(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3268 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3558 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.04 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": { + "value": 0.3183, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, + "acc_norm": { + "value": 0.3490, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, + }, + EVAL.LM_EVAL.MMLU: { + "acc": { + "value": 0.3099, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, + }, + } APPLY_CHAT_TEMPLATE = True V2 = False DEBUG = True diff --git a/tests/tasks/mmlu/default/_default_template_yaml b/tests/tasks/mmlu/default/_default_template_yaml index c97ebf293..e575f400c 100644 --- a/tests/tasks/mmlu/default/_default_template_yaml +++ b/tests/tasks/mmlu/default/_default_template_yaml @@ -1,4 +1,4 @@ -dataset_path: /monster/data/model/dataset/hails-mmlu_no_train # hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split +dataset_path: cais/mmlu test_split: test fewshot_split: dev fewshot_config: From 2756576740ea5658a5697e2ab701ab08900e4a23 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 07:21:25 +0000 Subject: [PATCH 2/6] new test struct2 --- tests/models/test_act_group_aware.py | 10 +++++++--- tests/models/test_apertus.py | 10 +++++++--- tests/models/test_cohere.py | 10 +++++++--- tests/models/test_cohere2.py | 10 +++++++--- tests/models/test_deci.py | 10 +++++++--- tests/models/test_dream.py | 10 +++++++--- tests/models/test_falcon.py | 10 +++++++--- tests/models/test_glm.py | 11 +++++++---- tests/models/test_gpt_oss.py | 10 +++++++--- tests/models/test_granite.py | 10 +++++++--- tests/models/test_hymba.py | 10 +++++++--- tests/models/test_ling.py | 10 +++++++--- tests/models/test_llama3_2_awq.py | 10 +++++++--- tests/models/test_llama4.py | 10 +++++++--- tests/models/test_longllama.py | 10 +++++++--- tests/models/test_mimo.py | 10 +++++++--- tests/models/test_multi_vs_single_gpu.py | 10 +++++++--- tests/models/test_nemotron_ultra.py | 10 +++++++--- tests/models/test_qwen2_5.py | 10 +++++++--- tests/models/test_qwen2_5_omni.py | 11 +++++++---- tests/models/test_qwen2_5_vl.py | 10 +++++++--- tests/models/test_qwen2_moe_quant.py | 10 +++++++--- tests/models/test_qwen2_vl.py | 10 +++++++--- tests/models/test_qwen3_moe.py | 10 +++++++--- tests/models/test_qwen3_next.py | 10 +++++++--- tests/models/test_qwen3_omni.py | 10 +++++++--- tests/models/test_seed_oss.py | 10 +++++++--- tests/models/test_stablelm.py | 10 +++++++--- tests/models/test_xverse.py | 10 +++++++--- tests/test_asym_gptq_v1.py | 10 +++++++--- tests/test_bits_new.py | 9 ++++++--- tests/test_gptqv2.py | 10 +++++++--- tests/test_lm_head.py | 18 +++++++++++++----- tests/test_lora.py | 10 +++++++--- tests/test_quant_and_eora.py | 9 ++++++--- tests/test_quant_and_eora_transformers.py | 9 ++++++--- 36 files changed, 255 insertions(+), 112 deletions(-) diff --git a/tests/models/test_act_group_aware.py b/tests/models/test_act_group_aware.py index e568eb958..66e5009bc 100644 --- a/tests/models/test_act_group_aware.py +++ b/tests/models/test_act_group_aware.py @@ -4,13 +4,17 @@ # Contact: 
qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestHybridActOrder(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3140 # A100 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3439 # A100 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.10 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3140, "floor_pct": 0.05}, + "acc_norm": {"value": 0.3439, "floor_pct": 0.05}, + }, + } APPLY_CHAT_TEMPLATE = True V2 = False ACT_GROUP_AWARE = True diff --git a/tests/models/test_apertus.py b/tests/models/test_apertus.py index fa675fb77..a609c8b8f 100644 --- a/tests/models/test_apertus.py +++ b/tests/models/test_apertus.py @@ -4,15 +4,19 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL from gptqmodel import BACKEND class TestApertus(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Apertus-8B-Instruct-2509/" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.5145 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5256 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5145, "floor_pct": 0.2}, + "acc_norm": {"value": 0.5256, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_cohere.py b/tests/models/test_cohere.py index 9d6636a96..35261d42a 100644 --- a/tests/models/test_cohere.py +++ b/tests/models/test_cohere.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestCohere(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/aya-expanse-8b" # "CohereForAI/aya-expanse-8b" - NATIVE_ARC_CHALLENGE_ACC = 0.5401 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5640 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.20 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5401, "floor_pct": 0.20}, + "acc_norm": {"value": 0.5640, "floor_pct": 0.20}, + }, + } EVAL_BATCH_SIZE = 4 def test_cohere(self): diff --git a/tests/models/test_cohere2.py b/tests/models/test_cohere2.py index 9c900e086..6b25bf6e6 100644 --- a/tests/models/test_cohere2.py +++ b/tests/models/test_cohere2.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestCohere2(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/c4ai-command-r7b-12-2024" - NATIVE_ARC_CHALLENGE_ACC = 0.4680 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4693 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.15 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.4680, "floor_pct": 0.15}, + "acc_norm": {"value": 0.4693, "floor_pct": 0.15}, + }, + } EVAL_BATCH_SIZE = 4 USE_FLASH_ATTN = False diff --git a/tests/models/test_deci.py b/tests/models/test_deci.py index cd66f029c..9fb81818f 100644 --- a/tests/models/test_deci.py +++ b/tests/models/test_deci.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestDeci(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/DeciLM-7B-instruct" # "Deci/DeciLM-7B-instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.5239 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5222 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.8 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5239, "floor_pct": 0.8}, + "acc_norm": {"value": 0.5222, "floor_pct": 0.8}, + 
}, + } TRUST_REMOTE_CODE = True USE_VLLM = False EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_dream.py b/tests/models/test_dream.py index 393041862..bb053d1be 100644 --- a/tests/models/test_dream.py +++ b/tests/models/test_dream.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestDream(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Dream-v0-Instruct-7B" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } APPLY_CHAT_TEMPLATE = True TRUST_REMOTE_CODE = True EVAL_BATCH_SIZE = 1 diff --git a/tests/models/test_falcon.py b/tests/models/test_falcon.py index 7975e2bd6..99d696646 100644 --- a/tests/models/test_falcon.py +++ b/tests/models/test_falcon.py @@ -5,16 +5,20 @@ import torch # noqa: E402from tests.model_test import ModelTest from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestFalcon(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/falcon-7b-instruct" # "tiiuae/falcon-7b-instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3993 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4292 APPLY_CHAT_TEMPLATE = True TRUST_REMOTE_CODE = False TORCH_DTYPE = torch.float16 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.52 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3993, "floor_pct": 0.52}, + "acc_norm": {"value": 0.4292, "floor_pct": 0.52}, + }, + } EVAL_BATCH_SIZE = 6 USE_VLLM = False diff --git a/tests/models/test_glm.py b/tests/models/test_glm.py index c14202c92..76b429f06 100644 --- a/tests/models/test_glm.py +++ b/tests/models/test_glm.py @@ -4,16 +4,19 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestGlm(ModelTest): # real: THUDM/glm-4-9b-chat-hf NATIVE_MODEL_ID = "/monster/data/model/glm-4-9b-chat-hf" - NATIVE_ARC_CHALLENGE_ACC = 0.5154 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5316 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5154, "floor_pct": 0.2}, + "acc_norm": {"value": 0.5316, "floor_pct": 0.2}, + }, + } USE_VLLM = False def test_glm(self): self.quant_lm_eval() - diff --git a/tests/models/test_gpt_oss.py b/tests/models/test_gpt_oss.py index 44befd520..38675d188 100644 --- a/tests/models/test_gpt_oss.py +++ b/tests/models/test_gpt_oss.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestGPTOSS(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/gpt-oss-20b-BF16/" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.4411 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4718 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.4411, "floor_pct": 0.2}, + "acc_norm": {"value": 0.4718, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = False EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_granite.py b/tests/models/test_granite.py index 83c9ff47c..4ea23c751 100644 --- a/tests/models/test_granite.py +++ b/tests/models/test_granite.py @@ -4,15 +4,19 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestGranite(ModelTest): NATIVE_MODEL_ID = 
"/monster/data/model/granite-3.0-2b-instruct" # "ibm-granite/granite-3.0-2b-instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.4505 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4770 APPLY_CHAT_TEMPLATE = True TRUST_REMOTE_CODE = True - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.4505, "floor_pct": 0.2}, + "acc_norm": {"value": 0.4770, "floor_pct": 0.2}, + }, + } def test_granite(self): self.quant_lm_eval() diff --git a/tests/models/test_hymba.py b/tests/models/test_hymba.py index 0337a6b02..c829f7b0f 100644 --- a/tests/models/test_hymba.py +++ b/tests/models/test_hymba.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestHymba(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Hymba-1.5B-Instruct/" # "baichuan-inc/Baichuan2-7B-Chat" - NATIVE_ARC_CHALLENGE_ACC = 0.2073 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2713 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.75 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2073, "floor_pct": 0.75}, + "acc_norm": {"value": 0.2713, "floor_pct": 0.75}, + }, + } MODEL_MAX_LEN = 8192 TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True diff --git a/tests/models/test_ling.py b/tests/models/test_ling.py index d53fcef45..65e6650dd 100644 --- a/tests/models/test_ling.py +++ b/tests/models/test_ling.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestLing(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Ling-mini-2.0/" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.5009 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5137 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5009, "floor_pct": 0.2}, + "acc_norm": {"value": 0.5137, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True # EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_llama3_2_awq.py b/tests/models/test_llama3_2_awq.py index e903c4bb9..4d7919158 100644 --- a/tests/models/test_llama3_2_awq.py +++ b/tests/models/test_llama3_2_awq.py @@ -4,6 +4,7 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL from gptqmodel.quantization import FORMAT, METHOD @@ -14,9 +15,12 @@ # desc_act = True, 0.3089/0.3328 class TestLlama3_2(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3234 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3524 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3234, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3524, "floor_pct": 0.36}, + }, + } APPLY_CHAT_TEMPLATE = True V2 = False DEBUG = True diff --git a/tests/models/test_llama4.py b/tests/models/test_llama4.py index 353dad96a..f04410d2a 100644 --- a/tests/models/test_llama4.py +++ b/tests/models/test_llama4.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestLlama4(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-4-Scout-17B-16E-Instruct" # "meta-llama/Llama-4-Scout-17B-16E-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + 
"acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } APPLY_CHAT_TEMPLATE = True TRUST_REMOTE_CODE = False diff --git a/tests/models/test_longllama.py b/tests/models/test_longllama.py index b06742e70..ea9992553 100644 --- a/tests/models/test_longllama.py +++ b/tests/models/test_longllama.py @@ -4,14 +4,18 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestLongLlama(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/long_llama_3b_instruct" # "syzymon/long_llama_3b_instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3515 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3652 TRUST_REMOTE_CODE = True - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.5 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3515, "floor_pct": 0.5}, + "acc_norm": {"value": 0.3652, "floor_pct": 0.5}, + }, + } USE_VLLM = False USE_FLASH_ATTN = False diff --git a/tests/models/test_mimo.py b/tests/models/test_mimo.py index e0a318f74..68f140aed 100644 --- a/tests/models/test_mimo.py +++ b/tests/models/test_mimo.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestMimo(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/MiMo-7B-RL" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2739 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2739, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3055, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_multi_vs_single_gpu.py b/tests/models/test_multi_vs_single_gpu.py index b0e0f94da..4be32736b 100644 --- a/tests/models/test_multi_vs_single_gpu.py +++ b/tests/models/test_multi_vs_single_gpu.py @@ -25,6 +25,7 @@ QUANT_LOG_NSAMPLES, ) from gptqmodel.quantization.config import QuantizeConfig +from gptqmodel.utils.eval import EVAL from gptqmodel.utils.torch import torch_empty_cache @@ -45,9 +46,12 @@ def _is_free_threaded() -> bool: class TestMultiVsSingleGPU(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3311 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3549 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.05 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3311, "floor_pct": 0.05}, + "acc_norm": {"value": 0.3549, "floor_pct": 0.05}, + }, + } APPLY_CHAT_TEMPLATE = True V2 = False DEBUG = True diff --git a/tests/models/test_nemotron_ultra.py b/tests/models/test_nemotron_ultra.py index 1e6a49c93..44eba9a54 100644 --- a/tests/models/test_nemotron_ultra.py +++ b/tests/models/test_nemotron_ultra.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestNemotronUltra(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3_1-Nemotron-Ultra-253B-v1" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } APPLY_CHAT_TEMPLATE = True TRUST_REMOTE_CODE = True diff --git a/tests/models/test_qwen2_5.py b/tests/models/test_qwen2_5.py index 6dc107995..c63f7ba75 100644 --- a/tests/models/test_qwen2_5.py +++ b/tests/models/test_qwen2_5.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, 
x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen2_5(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.05 - NATIVE_ARC_CHALLENGE_ACC = 0.2705 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3063 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2705, "floor_pct": 0.05}, + "acc_norm": {"value": 0.3063, "floor_pct": 0.05}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True #EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_qwen2_5_omni.py b/tests/models/test_qwen2_5_omni.py index e3d3a2844..26d5dca15 100644 --- a/tests/models/test_qwen2_5_omni.py +++ b/tests/models/test_qwen2_5_omni.py @@ -6,15 +6,19 @@ import soundfile as sf from model_test import ModelTest +from gptqmodel.utils.eval import EVAL from gptqmodel.models.definitions.qwen2_5_omni import Qwen2_5_OmniGPTQ class TestQwen2_5_Omni(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-Omni-3B" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2329 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2765 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2329, "floor_pct": 0.2}, + "acc_norm": {"value": 0.2765, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 @@ -86,4 +90,3 @@ def test_qwen2_5_omni(self): # delete audio file os.remove(audio_file_name) - diff --git a/tests/models/test_qwen2_5_vl.py b/tests/models/test_qwen2_5_vl.py index 035a3f8ac..75ba08f16 100644 --- a/tests/models/test_qwen2_5_vl.py +++ b/tests/models/test_qwen2_5_vl.py @@ -4,15 +4,19 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL from gptqmodel.models.definitions.qwen2_vl import Qwen2VLQModel class TestQwen2_VL(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-VL-3B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2329 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2765 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2329, "floor_pct": 0.2}, + "acc_norm": {"value": 0.2765, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_qwen2_moe_quant.py b/tests/models/test_qwen2_moe_quant.py index c217a89a2..c5ff402f2 100644 --- a/tests/models/test_qwen2_moe_quant.py +++ b/tests/models/test_qwen2_moe_quant.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen2_5_Moe(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen1.5-MoE-A2.7B" # Qwen/Qwen1.5-MoE-A2.7B - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2739 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2739, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3055, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index 119955df9..b6297f5fd 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -4,15 +4,19 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL from gptqmodel.models.definitions.qwen2_vl import Qwen2VLQModel class TestQwen2_VL(ModelTest): NATIVE_MODEL_ID = 
"/monster/data/model/Qwen2-VL-2B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2329 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2765 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2329, "floor_pct": 0.2}, + "acc_norm": {"value": 0.2765, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_qwen3_moe.py b/tests/models/test_qwen3_moe.py index 749235c1e..bf1d77c49 100644 --- a/tests/models/test_qwen3_moe.py +++ b/tests/models/test_qwen3_moe.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen3Moe(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-30B-A3B" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.04 - NATIVE_ARC_CHALLENGE_ACC = 0.3788 # a100 4,5,6,7 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3899 # a100 4,5,6,7 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3788, "floor_pct": 0.04}, + "acc_norm": {"value": 0.3899, "floor_pct": 0.04}, + }, + } # TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True # EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_qwen3_next.py b/tests/models/test_qwen3_next.py index d25b8b5bd..923834440 100644 --- a/tests/models/test_qwen3_next.py +++ b/tests/models/test_qwen3_next.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen3Next(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-Next-80B-A3B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.04 - NATIVE_ARC_CHALLENGE_ACC = 0.3900 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3900 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3900, "floor_pct": 0.04}, + "acc_norm": {"value": 0.3900, "floor_pct": 0.04}, + }, + } TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 4 diff --git a/tests/models/test_qwen3_omni.py b/tests/models/test_qwen3_omni.py index d386a424a..76e6a2881 100644 --- a/tests/models/test_qwen3_omni.py +++ b/tests/models/test_qwen3_omni.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen3Omni(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-Omni-30B-A3B-Instruct/" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2739 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2739, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3055, "floor_pct": 0.2}, + }, + } # TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True # EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_seed_oss.py b/tests/models/test_seed_oss.py index cdaca1d19..ab2a0c3eb 100644 --- a/tests/models/test_seed_oss.py +++ b/tests/models/test_seed_oss.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestSeedOSS(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Seed-OSS-36B-Instruct/" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2739 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2739, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3055, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_stablelm.py 
b/tests/models/test_stablelm.py index 89b819023..ca4b2fac2 100644 --- a/tests/models/test_stablelm.py +++ b/tests/models/test_stablelm.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestStablelm(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/stablelm-base-alpha-3b" - NATIVE_ARC_CHALLENGE_ACC = 0.2363 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2577 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2363, "floor_pct": 0.2}, + "acc_norm": {"value": 0.2577, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/models/test_xverse.py b/tests/models/test_xverse.py index 6ee887def..a1de853f5 100644 --- a/tests/models/test_xverse.py +++ b/tests/models/test_xverse.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestXVerse(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/XVERSE-7B-Chat" # "xverse/XVERSE-7B-Chat" - NATIVE_ARC_CHALLENGE_ACC = 0.4198 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4044 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.4198, "floor_pct": 0.2}, + "acc_norm": {"value": 0.4044, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index f58a045fd..692ddbb3b 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -12,13 +12,17 @@ from models.model_test import ModelTest # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } FORMAT = FORMAT.GPTQ SYM = False diff --git a/tests/test_bits_new.py b/tests/test_bits_new.py index 12b530660..40c9eb4f9 100644 --- a/tests/test_bits_new.py +++ b/tests/test_bits_new.py @@ -70,9 +70,12 @@ class Test(ModelTest): # NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-3B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } @classmethod def setUpClass(cls): diff --git a/tests/test_gptqv2.py b/tests/test_gptqv2.py index 654e731f5..ee96572d0 100644 --- a/tests/test_gptqv2.py +++ b/tests/test_gptqv2.py @@ -4,13 +4,17 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from models.model_test import ModelTest +from gptqmodel.utils.eval import EVAL class TestQwen2_5_GPTQv2(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.2739 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3055 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2739, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3055, "floor_pct": 0.2}, + }, + } TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True EVAL_BATCH_SIZE = 6 diff 
--git a/tests/test_lm_head.py b/tests/test_lm_head.py index df7ee2119..3319a308d 100644 --- a/tests/test_lm_head.py +++ b/tests/test_lm_head.py @@ -13,6 +13,7 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 @@ -21,9 +22,12 @@ class TestLmHeadLoad(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" # "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse" DEVICE = "cuda:0" - NATIVE_ARC_CHALLENGE_ACC = 0.2799 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3046 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.2799, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3046, "floor_pct": 0.2}, + }, + } def test_load(self): model = GPTQModel.load(self.NATIVE_MODEL_ID, device=self.DEVICE) @@ -51,8 +55,12 @@ def setUpClass(cls): cls.calibration_dataset = [c[:cls.sample_length] for c in calibration_dataset] def test_quant_lm_head(self): - self.NATIVE_ARC_CHALLENGE_ACC = 0.3148464163822526 - self.NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3310580204778157 + self.EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3148464163822526, "floor_pct": 0.2}, + "acc_norm": {"value": 0.3310580204778157, "floor_pct": 0.2}, + }, + } quant_config = QuantizeConfig(bits=4, group_size=32, lm_head=True) diff --git a/tests/test_lora.py b/tests/test_lora.py index 5d74e6dd5..8e06bd50e 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -25,15 +25,19 @@ from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.adapter.adapter import Lora # noqa: E402 +from gptqmodel.utils.eval import EVAL # noqa: E402 class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128" lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc/blob/main/adapter_model.safetensors" #"sliuau/llama3.2-1b-4bit-group128-eora_test-rank128-arc" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } @classmethod def setUpClass(cls): diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py index 92055bc89..0bd141f94 100644 --- a/tests/test_quant_and_eora.py +++ b/tests/test_quant_and_eora.py @@ -28,9 +28,12 @@ class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" - NATIVE_ARC_CHALLENGE_ACC = 0.3183 # Eora: 0.3234 -> A100 GPU 6 MARLIN KERNEL - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3404 # Eora: 0.3609 -> A100 GPU 6 MARLIN KERNEL - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3183, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3404, "floor_pct": 0.36}, + }, + } APPLY_CHAT_TEMPLATE = True V2 = False diff --git a/tests/test_quant_and_eora_transformers.py b/tests/test_quant_and_eora_transformers.py index a35a341b9..3bf333ae7 100644 --- a/tests/test_quant_and_eora_transformers.py +++ b/tests/test_quant_and_eora_transformers.py @@ -48,9 +48,12 @@ 
class Test(ModelTest): # NATIVE_MODEL_ID = "/monster/data/model/tinyllama-15M-stories" NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B" - NATIVE_ARC_CHALLENGE_ACC = 0.3567 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3805 - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.36 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.3567, "floor_pct": 0.36}, + "acc_norm": {"value": 0.3805, "floor_pct": 0.36}, + }, + } @classmethod def setUpClass(cls): From 22e3d22239734f76c3c1b490088572d26f1ab93b Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 08:09:53 +0000 Subject: [PATCH 3/6] update scores --- tests/models/test_qwen2_5.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/test_qwen2_5.py b/tests/models/test_qwen2_5.py index c63f7ba75..d4af6eb32 100644 --- a/tests/models/test_qwen2_5.py +++ b/tests/models/test_qwen2_5.py @@ -11,8 +11,11 @@ class TestQwen2_5(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct" EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { - "acc": {"value": 0.2705, "floor_pct": 0.05}, - "acc_norm": {"value": 0.3063, "floor_pct": 0.05}, + "acc": {"value": 0.2722, "floor_pct": 0.04}, + "acc_norm": {"value": 0.3072, "floor_pct": 0.04}, + }, + EVAL.LM_EVAL.MMLU: { + "acc": {"value": 0.4029, "floor_pct": 0.04}, }, } TRUST_REMOTE_CODE = False From fc6b3c232e50caf56f2eb237fd2d9c009ec34a6a Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 09:12:41 +0000 Subject: [PATCH 4/6] update scores, use 512 rows for ci tests --- tests/models/model_test.py | 4 ++-- tests/models/test_glm.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/models/model_test.py b/tests/models/model_test.py index bf9f72036..3658a1212 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -89,8 +89,8 @@ class ModelTest(unittest.TestCase): USE_VLLM = False INPUTS_MAX_LENGTH = 2048 MODEL_MAX_LEN = 4096 - DATASET_SIZE = 256 - DATASET_SORT = "asc" + DATASET_SIZE = 512 + DATASET_SORT = "desc" DELETE_QUANTIZED_MODEL = True EVAL_TASKS = None diff --git a/tests/models/test_glm.py b/tests/models/test_glm.py index 76b429f06..295f234f1 100644 --- a/tests/models/test_glm.py +++ b/tests/models/test_glm.py @@ -6,17 +6,23 @@ from model_test import ModelTest from gptqmodel.utils.eval import EVAL - +# | Metric | MARLIN | +# |--------------------------------|----------| +# | arc_challenge :: acc,none | 0.5026 | +# | arc_challenge :: acc_norm,none | 0.5171 | +# | mmlu :: acc,none | 0.6362 | class TestGlm(ModelTest): # real: THUDM/glm-4-9b-chat-hf NATIVE_MODEL_ID = "/monster/data/model/glm-4-9b-chat-hf" EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { - "acc": {"value": 0.5154, "floor_pct": 0.2}, - "acc_norm": {"value": 0.5316, "floor_pct": 0.2}, + "acc": {"value": 0.5026, "floor_pct": 0.04}, + "acc_norm": {"value": 0.5171, "floor_pct": 0.04}, + }, + EVAL.LM_EVAL.MMLU: { + "acc": {"value": 0.6362, "floor_pct": 0.04}, }, } - USE_VLLM = False def test_glm(self): self.quant_lm_eval() From 969c3da0de37867495001abe4a1abea06eab8946 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 10:52:39 +0000 Subject: [PATCH 5/6] simplify test --- tests/models/test_llama3_2.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/models/test_llama3_2.py b/tests/models/test_llama3_2.py index 3686ad9b6..5833808d8 100644 --- a/tests/models/test_llama3_2.py +++ b/tests/models/test_llama3_2.py @@ -13,38 +13,37 @@ # desc_act = True, REGRESSION 0.3191/0.3601 # a100:6+7: 
MARLIN kernel # desc_act = False, act_group_aware = True 0.3217/0.3643 +# | Metric | MARLIN | +# |--------------------------------|----------| +# | arc_challenge :: acc,none | 0.3174 | +# | arc_challenge :: acc_norm,none | 0.3601 | +# | mmlu :: acc,none | 0.3186 | class TestLlama3_2(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { "acc": { - "value": 0.3183, + "value": 0.3174, "floor_pct": 0.04, "ceil_pct": 0.10, }, "acc_norm": { - "value": 0.3490, + "value": 0.3601, "floor_pct": 0.04, "ceil_pct": 0.10, }, }, EVAL.LM_EVAL.MMLU: { "acc": { - "value": 0.3099, + "value": 0.3186, "floor_pct": 0.04, "ceil_pct": 0.10, }, }, } APPLY_CHAT_TEMPLATE = True - V2 = False - DEBUG = True - ACT_GROUP_AWARE = True - DESC_ACT = False - DATASET_SIZE = 1024 - DATASET_SORT = "desc" QUANT_BATCH_SIZE = 4 - USE_FLASH_ATTN = True + # EORA = Lora( # # for quant, path is save path. for load, it is loading path # path="./eora_test", From 88717611f1264e956bf2031b75531bb9526e8b43 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 18 Oct 2025 11:20:43 +0000 Subject: [PATCH 6/6] update tests --- tests/models/test_qwen3_moe.py | 29 ++++++++++++++++------------- tests/test_quant_and_eora.py | 30 ++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/tests/models/test_qwen3_moe.py b/tests/models/test_qwen3_moe.py index bf1d77c49..d398b3c24 100644 --- a/tests/models/test_qwen3_moe.py +++ b/tests/models/test_qwen3_moe.py @@ -6,27 +6,30 @@ from model_test import ModelTest from gptqmodel.utils.eval import EVAL - +# | Metric | MARLIN | +# |--------------------------------|----------| +# | arc_challenge :: acc,none | 0.5094 | +# | arc_challenge :: acc_norm,none | 0.5486 | class TestQwen3Moe(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-30B-A3B" EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { - "acc": {"value": 0.3788, "floor_pct": 0.04}, - "acc_norm": {"value": 0.3899, "floor_pct": 0.04}, + "acc": {"value": 0.5094, "floor_pct": 0.04}, + "acc_norm": {"value": 0.5486, "floor_pct": 0.04}, }, } # TRUST_REMOTE_CODE = False - APPLY_CHAT_TEMPLATE = True + # APPLY_CHAT_TEMPLATE = True # EVAL_BATCH_SIZE = 6 - V2 = False - DEBUG = True - ACT_GROUP_AWARE = True - DESC_ACT = False - DATASET_SIZE = 1024 - DATASET_SORT = "desc" - QUANT_BATCH_SIZE = 4 - CALIB_NOISE_MODE = "unseen" - CALIB_NOISE_PERCENT = 0.025 + # V2 = False + # DEBUG = True + # ACT_GROUP_AWARE = True + # DESC_ACT = False + # DATASET_SIZE = 512 + # DATASET_SORT = "desc" + # QUANT_BATCH_SIZE = 4 + # CALIB_NOISE_MODE = "unseen" + # CALIB_NOISE_PERCENT = 0.025 def test_mimo(self): self.quant_lm_eval() diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py index 0bd141f94..1f49550dc 100644 --- a/tests/test_quant_and_eora.py +++ b/tests/test_quant_and_eora.py @@ -24,26 +24,36 @@ from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -# a100 gpu 7 +# --------Eval METHOD.GPTQ Result--------- +# | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +# |-------------|------:|------|-----:|--------|---|-----:|---|-----:| +# |arc_challenge| 1|none | 0|acc |↑ |0.3131|± |0.0136| +# | | |none | 0|acc_norm|↑ |0.3473|± |0.0139| +# +# --------Eval METHOD.GPTQ + EoRA Result--------- +# | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr| +# |-------------|------:|------|-----:|--------|---|-----:|---|-----:| +# |arc_challenge| 1|none | 0|acc |↑ |0.3140|± |0.0136| +# | | |none | 0|acc_norm|↑ |0.3567|± 
|0.0140| class Test(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { - "acc": {"value": 0.3183, "floor_pct": 0.36}, - "acc_norm": {"value": 0.3404, "floor_pct": 0.36}, + "acc": {"value": 0.3183, "floor_pct": 0.05}, + "acc_norm": {"value": 0.3404, "floor_pct": 0.05}, }, } APPLY_CHAT_TEMPLATE = True - V2 = False - DEBUG = True - ACT_GROUP_AWARE = True - DESC_ACT = False - DATASET_SIZE = 1024 - DATASET_SORT = "desc" QUANT_BATCH_SIZE = 4 - USE_FLASH_ATTN = True + # V2 = False + # DEBUG = True + # ACT_GROUP_AWARE = True + # DESC_ACT = False + # DATASET_SIZE = 512 + # DATASET_SORT = "desc" + # USE_FLASH_ATTN = True @classmethod def setUpClass(cls):
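Example usage of the new test structure introduced by this series (a minimal sketch, not part of the patch): a model test declares per-task baselines through the EVAL_TASKS class attribute, keyed by EVAL.LM_EVAL task enums, instead of the legacy NATIVE_ARC_CHALLENGE_ACC / NATIVE_ARC_CHALLENGE_ACC_NORM / QUANT_ARC_MAX_DELTA_FLOOR_PERCENT attributes. The class name TestMyModel, the model path, and the baseline numbers below are illustrative placeholders; a bare float spec falls back to DEFAULT_FLOOR_PCT / DEFAULT_CEIL_PCT from model_test.py, while the dict form sets a per-metric tolerance band.

    from model_test import ModelTest
    from gptqmodel.utils.eval import EVAL


    class TestMyModel(ModelTest):  # hypothetical test class, for illustration only
        NATIVE_MODEL_ID = "/path/to/model"  # placeholder model path
        APPLY_CHAT_TEMPLATE = True

        EVAL_TASKS = {
            EVAL.LM_EVAL.ARC_CHALLENGE: {
                # bare float: uses DEFAULT_FLOOR_PCT (0.05) and DEFAULT_CEIL_PCT (0.10)
                "acc": 0.3183,
                # dict form: per-metric tolerance band around the baseline value
                "acc_norm": {"value": 0.3490, "floor_pct": 0.04, "ceil_pct": 0.10},
            },
            EVAL.LM_EVAL.MMLU: {
                "acc": {"value": 0.3099, "floor_pct": 0.04},
            },
        }

        def test_my_model(self):
            self.quant_lm_eval()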
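For reference, check_results() in the patched model_test.py accepts a metric when the measured value, expressed as a percentage of the baseline (as computed by calculatorPer), lies inside the band defined by floor_pct and ceil_pct. A short worked sketch with illustrative numbers (measured acc_norm 0.3420 against a baseline of 0.3490, floor_pct=0.04, ceil_pct=0.10):

    # Illustrative numbers only; mirrors the bound check in check_results()
    value, expected = 0.3420, 0.3490
    floor_pct, ceil_pct = 0.04, 0.10

    diff_pct = (value / expected) * 100   # ~98.0%, as computed by calculatorPer()
    lower = 100 * (1 - floor_pct)         # 96.0
    upper = 100 * (1 + ceil_pct)          # 110.0
    assert lower <= diff_pct <= upper     # within [96%, 110%], so the metric passes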