diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index b6d951480..bc88cdfd9 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -1323,6 +1323,9 @@ def _clone_model_init_kwargs(self, source: PreTrainedModel) -> Dict[str, Any]:
         return copy.deepcopy(kwargs)
 
     def reload_turtle_model(self) -> None:
+        if self.quantize_config.offload_to_disk is False:
+            return
+
         if threading.current_thread() is not threading.main_thread():
             raise RuntimeError("Turtle reloads must run on the main thread")
 
diff --git a/gptqmodel/models/definitions/bailing_moe.py b/gptqmodel/models/definitions/bailing_moe.py
index 7cba27d48..7d29e6e11 100644
--- a/gptqmodel/models/definitions/bailing_moe.py
+++ b/gptqmodel/models/definitions/bailing_moe.py
@@ -20,11 +20,11 @@ class BailingMoeQModel(BaseQModel):
         "#",
         {
             "input_layernorm": ("input_layernorm:!",),
-            "self_attn": ("query_key_value"),
+            "attention": ("query_key_value"),
             "post_attention_layernorm": ("post_attention_layernorm:!",),
             "mlp": {
                 "gate": ("gate:!",),  # <-- 0.5MB per layer. Not worth quantizing
-                "shared_expert": ("gate_proj", "up_proj", "down_proj"),
+                "shared_experts": ("gate_proj", "up_proj", "down_proj"),
                 "experts": {
                     "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                 },
diff --git a/tests/models/model_test.py b/tests/models/model_test.py
index 0626214be..fa0901353 100644
--- a/tests/models/model_test.py
+++ b/tests/models/model_test.py
@@ -212,7 +212,7 @@ def run_arc_challenge_eval(self, model, backend, trust_remote_code=False):
         task_results = self.lm_eval(
             model=model,
             apply_chat_template=self.APPLY_CHAT_TEMPLATE,
-            trust_remote_code=trust_remote_code,
+            trust_remote_code=self.TRUST_REMOTE_CODE,
             delete_quantized_model=False,
         )
         log.info(f"[{backend.name}] ARC summary: {task_results}")
diff --git a/tests/models/test_ling.py b/tests/models/test_ling.py
new file mode 100644
index 000000000..d53fcef45
--- /dev/null
+++ b/tests/models/test_ling.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from model_test import ModelTest
+
+
+class TestLing(ModelTest):
+    NATIVE_MODEL_ID = "/monster/data/model/Ling-mini-2.0/"
+    QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2
+    NATIVE_ARC_CHALLENGE_ACC = 0.5009
+    NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5137
+    TRUST_REMOTE_CODE = True
+    APPLY_CHAT_TEMPLATE = True
+    # EVAL_BATCH_SIZE = 6
+    V2 = False
+    DEBUG = True
+    ACT_GROUP_AWARE = True
+    DESC_ACT = False
+    DATASET_SIZE = 2048
+    DATASET_SORT = "desc"
+    QUANT_BATCH_SIZE = 8
+    CALIB_NOISE_MODE = "unseen"
+    CALIB_NOISE_PERCENT = 0.025
+
+    def test_ling(self):
+        self.quant_lm_eval()
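
Notes on the changes. In `reload_turtle_model`, the new guard turns the method into a no-op when disk offload is disabled, since there is no offloaded state to reload. The explicit `offload_to_disk is False` comparison (rather than `not offload_to_disk`) means an unset or `None` flag still falls through to the reload path. A minimal sketch of the guard, assuming only the `quantize_config.offload_to_disk` attribute shown in the diff; the reload body itself is elided:

```python
import threading


class TurtleReloadSketch:
    """Hypothetical stand-in for BaseQModel, holding only the
    `quantize_config.offload_to_disk` flag the new guard reads."""

    def __init__(self, quantize_config):
        self.quantize_config = quantize_config

    def reload_turtle_model(self) -> None:
        # `is False` short-circuits only on an explicit False; an
        # unset/None flag still falls through to the reload path.
        if self.quantize_config.offload_to_disk is False:
            return

        # Reloads mutate shared model state, so they stay pinned to
        # the main thread, as in the patched method.
        if threading.current_thread() is not threading.main_thread():
            raise RuntimeError("Turtle reloads must run on the main thread")

        ...  # the actual reload logic is elided here
```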
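The `bailing_moe.py` renames bring the quantization module tree in line with the checkpoint's actual submodule names: as the rename suggests, BailingMoe decoder blocks expose `attention` rather than the more common `self_attn`, and the shared-expert branch is `shared_experts` (plural). Tree keys are consumed as attribute-path segments, so a mismatched key fails to find the corresponding layers. The helper below is hypothetical, written only to illustrate the failure mode; it is not GPTQModel's actual lookup code:

```python
import torch.nn as nn


def resolve(root: nn.Module, path: str) -> nn.Module:
    """Walk a dotted module path, e.g. 'model.layers.0.attention.query_key_value'.
    Each tree key becomes one path segment, so keys must match the
    checkpoint's attribute names exactly."""
    node = root
    for part in path.split("."):
        # Numeric segments index into a ModuleList; the rest are attributes.
        node = node[int(part)] if part.isdigit() else getattr(node, part)
    return node


# With a loaded BailingMoe model, the corrected keys resolve while the
# old ones raise AttributeError:
#   resolve(model, "model.layers.0.attention.query_key_value")  # found
#   resolve(model, "model.layers.0.self_attn.query_key_value")  # AttributeError
```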
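In `model_test.py`, `run_arc_challenge_eval` previously forwarded its own `trust_remote_code` parameter, which defaults to `False`, so a subclass setting `TRUST_REMOTE_CODE = True` had no effect on this eval. Reading the class attribute fixes that, though it leaves the `trust_remote_code` parameter in the signature unused. A toy sketch of the override pattern, with hypothetical simplified names:

```python
class ModelTestSketch:
    TRUST_REMOTE_CODE = False  # harness-wide default

    def run_eval(self) -> bool:
        # Reading the class attribute picks up subclass overrides
        # without threading a flag through every call site.
        return self.TRUST_REMOTE_CODE


class LingTestSketch(ModelTestSketch):
    TRUST_REMOTE_CODE = True  # a remote-code model opts in once


assert LingTestSketch().run_eval() is True
```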
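Finally, `test_ling.py` adds coverage for Ling-mini-2.0, which exercises the BailingMoe tree fix above; `TRUST_REMOTE_CODE = True` is exactly the flag the harness change makes effective, and `APPLY_CHAT_TEMPLATE = True` routes eval prompts through the model's chat template. Judging by its name, `QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2` bounds how far the quantized ARC-Challenge score may fall below the recorded native baselines. `NATIVE_MODEL_ID` points at a local checkout (`/monster/data/model/Ling-mini-2.0/`), so the test assumes that checkpoint is present on the host; it can be run directly with, e.g., `python -m pytest tests/models/test_ling.py -s`.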