diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index d8d705064..988ecdf8f 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -138,6 +138,7 @@
 from .definitions.starcoder2 import Starcoder2QModel  # noqa: E402
 from .definitions.telechat2 import TeleChat2QModel
 from .definitions.xverse import XverseQModel  # noqa: E402
+from .definitions.granitemoehybrid import GraniteMoeHybridQModel
 
 # make quants and inference more determinisitc
 
@@ -217,6 +218,7 @@
     "mllama": MLlamaQModel,
     "marin": Qwen3QModel,
     "granite": LlamaQModel,  # 100% llama clone
+    "granitemoehybrid": GraniteMoeHybridQModel,
     "mobilellm": MobileLLMQModel,
     "hymba": HymbaQModel,
     "olmo2": LlamaQModel,  # 100% llama clone
diff --git a/gptqmodel/models/definitions/granitemoehybrid.py b/gptqmodel/models/definitions/granitemoehybrid.py
new file mode 100644
index 000000000..94440c4b4
--- /dev/null
+++ b/gptqmodel/models/definitions/granitemoehybrid.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from ..base import BaseQModel
+
+
+class GraniteMoeHybridQModel(BaseQModel):
+    pre_lm_head_norm_module = "model.norm"
+
+    layer_modules_strict = False
+
+    module_tree = [
+        "model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "mamba": ("in_proj:0", "out_proj:1"),
+            "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
+            "shared_mlp": ("input_linear:0", "output_linear:1"),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+        }
+    ]
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 9e51e6fc3..dd4d25a80 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -759,20 +759,26 @@ def pack_module(
         module_name=name,
     ):
         if effective_impl == "gpu":
-            module.pack_gpu(
-                linear=layer,
-                scales=q_scales,
-                zeros=q_zeros,
-                g_idx=q_g_idx,
-                device=target_device,
-            )
+            try:
+                module.pack_gpu(
+                    linear=layer,
+                    scales=q_scales,
+                    zeros=q_zeros,
+                    g_idx=q_g_idx,
+                    device=target_device,
+                )
+            except ValueError:
+                module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
         elif effective_impl == "block":
-            module.pack_block(
-                linear=layer,
-                scales=q_scales,
-                zeros=q_zeros,
-                g_idx=q_g_idx,
-            )
+            try:
+                module.pack_block(
+                    linear=layer,
+                    scales=q_scales,
+                    zeros=q_zeros,
+                    g_idx=q_g_idx,
+                )
+            except ValueError:
+                module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
         else:
             module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
 
diff --git a/tests/models/model_test.py b/tests/models/model_test.py
index 6ee99e2ad..db50e7c38 100644
--- a/tests/models/model_test.py
+++ b/tests/models/model_test.py
@@ -382,7 +382,13 @@ def perform_post_quant_validation(self, model_path, trust_remote_code=False):
         eval_records = {}
         reuse_candidates = {}
 
-        compare_backends = (BACKEND.MARLIN,) if self.FORMAT is FORMAT.GPTQ else (BACKEND.MARLIN, BACKEND.GEMM)
+        if self.FORMAT is FORMAT.GPTQ:
+            if self.LOAD_BACKEND == BACKEND.MARLIN:
+                compare_backends = (BACKEND.MARLIN,)
+            else:
+                compare_backends = (self.LOAD_BACKEND,)
+        else:
+            compare_backends = (BACKEND.MARLIN, BACKEND.GEMM)
         fallback_backend = None
         if BACKEND.MARLIN in compare_backends:
             try:
diff --git a/tests/models/test_granite_4_0_h_1b.py b/tests/models/test_granite_4_0_h_1b.py
new file mode 100644
index 000000000..b01d4924c
--- /dev/null
+++ b/tests/models/test_granite_4_0_h_1b.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+from gptqmodel import BACKEND
+from model_test import ModelTest
+
+from gptqmodel.utils.eval import EVAL
+
+
+# a100:0, TORCH kernel
+# desc_act = False, act_group_aware = True
+# | Metric                         | MARLIN   |
+# |--------------------------------|----------|
+# | arc_challenge :: acc,none      | 0.3968   |
+# | arc_challenge :: acc_norm,none | 0.4138   |
+# | mmlu_stem :: acc,none          | 0.4015   |
+class Test_Granite_4_0_H_1B(ModelTest):
+    NATIVE_MODEL_ID = "/monster/data/model/granite-4.0-h-1b"  # "ibm-granite/granite-4.0-h-1b"
+    GROUP_SIZE = 32
+    EVAL_BATCH_SIZE = 1
+    LOAD_BACKEND = BACKEND.TORCH
+    EVAL_TASKS = {
+        EVAL.LM_EVAL.ARC_CHALLENGE: {
+            "chat_template": True,
+            "acc": {
+                "value": 0.3968,
+                "floor_pct": 0.04,
+                "ceil_pct": 0.10,
+            },
+            "acc_norm": {
+                "value": 0.4138,
+                "floor_pct": 0.04,
+                "ceil_pct": 0.10,
+            },
+        },
+        EVAL.LM_EVAL.MMLU_STEM: {
+            "chat_template": False,
+            "acc": {
+                "value": 0.4015,
+                "floor_pct": 0.1,
+                "ceil_pct": 0.20,
+            },
+        },
+    }
+
+    def test_granite(self):
+        self.quant_lm_eval()
diff --git a/tests/models/test_granite_4_0_h_350m.py b/tests/models/test_granite_4_0_h_350m.py
new file mode 100644
index 000000000..75fb282fd
--- /dev/null
+++ b/tests/models/test_granite_4_0_h_350m.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+from gptqmodel import BACKEND
+from model_test import ModelTest
+
+from gptqmodel.utils.eval import EVAL
+
+
+# a100:0, TORCH kernel
+# desc_act = False, act_group_aware = True
+# | Metric                         | MARLIN   |
+# |--------------------------------|----------|
+# | arc_challenge :: acc,none      | 0.3046   |
+# | arc_challenge :: acc_norm,none | 0.3157   |
+# | mmlu_stem :: acc,none          | 0.2915   |
+class Test_Granite_4_0_H_350M(ModelTest):
+    NATIVE_MODEL_ID = "/monster/data/model/granite-4.0-h-350m"  # "ibm-granite/granite-4.0-h-350m"
+    GROUP_SIZE = 32
+    EVAL_BATCH_SIZE = 16
+    LOAD_BACKEND = BACKEND.TORCH
+    EVAL_TASKS = {
+        EVAL.LM_EVAL.ARC_CHALLENGE: {
+            "chat_template": True,
+            "acc": {
+                "value": 0.3046,
+                "floor_pct": 0.04,
+                "ceil_pct": 0.10,
+            },
+            "acc_norm": {
+                "value": 0.3157,
+                "floor_pct": 0.04,
+                "ceil_pct": 0.10,
+            },
+        },
+        EVAL.LM_EVAL.MMLU_STEM: {
+            "chat_template": False,
+            "acc": {
+                "value": 0.2915,
+                "floor_pct": 0.1,
+                "ceil_pct": 0.20,
+            },
+        },
+    }
+
+    def test_granite(self):
+        self.quant_lm_eval()
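
Usage sketch (not part of the patch): a minimal end-to-end flow that the new `granitemoehybrid` registration is meant to enable, assuming the standard GPTQModel quantize/save/load API. The output path and calibration strings below are illustrative; `group_size=32` and `BACKEND.TORCH` mirror the test configs above.

```python
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig

# Illustrative calibration data; real runs use a much larger corpus.
calibration = [
    "Granite 4.0-H interleaves mamba and self_attn blocks with a shared_mlp.",
    "GPTQ quantizes each linear projection named in the module_tree above.",
]

# group_size=32 mirrors GROUP_SIZE in the tests above.
quant_config = QuantizeConfig(bits=4, group_size=32)

# Loading by architecture resolves to GraniteMoeHybridQModel via the new
# "granitemoehybrid" entry registered in gptqmodel/models/auto.py.
model = GPTQModel.load("ibm-granite/granite-4.0-h-350m", quant_config)
model.quantize(calibration)
model.save("./granite-4.0-h-350m-gptq-4bit")  # placeholder output path

# Reload with the TORCH kernel, matching LOAD_BACKEND in the tests.
model = GPTQModel.load("./granite-4.0-h-350m-gptq-4bit", backend=BACKEND.TORCH)
```

With `layer_modules_strict = False`, layers may omit some named submodules (each decoder layer is either a mamba or an attention block), so the quantizer skips absent entries instead of raising; the `pack_gpu`/`pack_block` changes add a `pack_original` fallback when a kernel path rejects a shape with `ValueError`.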