2 changes: 2 additions & 0 deletions gptqmodel/models/auto.py
@@ -138,6 +138,7 @@
from .definitions.starcoder2 import Starcoder2QModel # noqa: E402
from .definitions.telechat2 import TeleChat2QModel
from .definitions.xverse import XverseQModel # noqa: E402
from .definitions.granitemoehybrid import GraniteMoeHybridQModel


# make quants and inference more deterministic
@@ -217,6 +218,7 @@
"mllama": MLlamaQModel,
"marin": Qwen3QModel,
"granite": LlamaQModel, # 100% llama clone
"granitemoehybrid": GraniteMoeHybridQModel,
"mobilellm": MobileLLMQModel,
"hymba": HymbaQModel,
"olmo2": LlamaQModel, # 100% llama clone
25 changes: 25 additions & 0 deletions gptqmodel/models/definitions/granitemoehybrid.py
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

from ..base import BaseQModel


class GraniteMoeHybridQModel(BaseQModel):
pre_lm_head_norm_module = "model.norm"

layer_modules_strict = False

module_tree = [
"model",
"layers",
"#",
{
"input_layernorm": ("input_layernorm:!",),
"mamba": ("in_proj:0", "out_proj:1"),
"self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
"shared_mlp": ("input_linear:0", "output_linear:1"),
"post_attention_layernorm": ("post_attention_layernorm:!",),
}
]
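
With GraniteMoeHybridQModel registered in auto.py, a Granite 4.0 hybrid checkpoint can be quantized through GPTQModel's usual flow. The snippet below is a minimal sketch only, assuming the standard load/quantize/save API; the model id, calibration texts, batch size, and output path are placeholders.

from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder calibration data; real runs should use a proper calibration corpus.
calibration_dataset = ["GPTQModel calibration sample text."] * 256

# group_size=32 mirrors the GROUP_SIZE used by the tests below.
quant_config = QuantizeConfig(bits=4, group_size=32)

model = GPTQModel.load("ibm-granite/granite-4.0-h-1b", quant_config)
model.quantize(calibration_dataset, batch_size=1)
model.save("granite-4.0-h-1b-gptq-4bit")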
32 changes: 19 additions & 13 deletions gptqmodel/utils/model.py
@@ -759,20 +759,26 @@ def pack_module(
module_name=name,
):
if effective_impl == "gpu":
module.pack_gpu(
linear=layer,
scales=q_scales,
zeros=q_zeros,
g_idx=q_g_idx,
device=target_device,
)
try:
module.pack_gpu(
linear=layer,
scales=q_scales,
zeros=q_zeros,
g_idx=q_g_idx,
device=target_device,
)
except ValueError:
module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
elif effective_impl == "block":
module.pack_block(
linear=layer,
scales=q_scales,
zeros=q_zeros,
g_idx=q_g_idx,
)
try:
module.pack_block(
linear=layer,
scales=q_scales,
zeros=q_zeros,
g_idx=q_g_idx,
)
except ValueError:
module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
else:
module.pack_original(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
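
For context, the change above lets the accelerated packing paths degrade gracefully: when pack_gpu or pack_block raises ValueError for a layer layout it cannot handle, packing falls back to pack_original instead of failing. A condensed sketch of that control flow follows; pack_with_fallback is a hypothetical helper name, not the library's code.

def pack_with_fallback(module, layer, q_scales, q_zeros, q_g_idx, impl, device=None):
    kwargs = dict(linear=layer, scales=q_scales, zeros=q_zeros, g_idx=q_g_idx)
    if impl == "gpu":
        fast_path = lambda: module.pack_gpu(device=device, **kwargs)
    elif impl == "block":
        fast_path = lambda: module.pack_block(**kwargs)
    else:
        # No fast path requested: use the reference implementation directly.
        module.pack_original(**kwargs)
        return
    try:
        fast_path()
    except ValueError:
        # The fast path rejected this layer; fall back to the reference packing.
        module.pack_original(**kwargs)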

8 changes: 7 additions & 1 deletion tests/models/model_test.py
@@ -382,7 +382,13 @@ def perform_post_quant_validation(self, model_path, trust_remote_code=False):
eval_records = {}
reuse_candidates = {}

compare_backends = (BACKEND.MARLIN,) if self.FORMAT is FORMAT.GPTQ else (BACKEND.MARLIN, BACKEND.GEMM)
if self.FORMAT is FORMAT.GPTQ:
if self.LOAD_BACKEND == BACKEND.MARLIN:
compare_backends = (BACKEND.MARLIN,)
else:
compare_backends = (self.LOAD_BACKEND,)
else:
compare_backends = (BACKEND.MARLIN, BACKEND.GEMM)
fallback_backend = None
if BACKEND.MARLIN in compare_backends:
try:
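
This backend-selection tweak matters for the new Granite tests below, which set LOAD_BACKEND = BACKEND.TORCH: under the GPTQ format the post-quant comparison now uses the requested kernel rather than always forcing MARLIN. A rough illustration, assuming ModelTest.FORMAT defaults to FORMAT.GPTQ and that FORMAT is importable from gptqmodel.quantization:

from gptqmodel import BACKEND
from gptqmodel.quantization import FORMAT  # assumed import path

fmt, load_backend = FORMAT.GPTQ, BACKEND.TORCH  # mirrors the Granite tests below
if fmt is FORMAT.GPTQ:
    compare_backends = (BACKEND.MARLIN,) if load_backend == BACKEND.MARLIN else (load_backend,)
else:
    compare_backends = (BACKEND.MARLIN, BACKEND.GEMM)

assert compare_backends == (BACKEND.TORCH,)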
48 changes: 48 additions & 0 deletions tests/models/test_granite_4_0_h_1b.py
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium
from gptqmodel import BACKEND
from model_test import ModelTest

from gptqmodel.utils.eval import EVAL


# a100:0, TORCH kernel
# desc_act = False, act_group_aware = True
# | Metric | MARLIN |
# |--------------------------------|----------|
# | arc_challenge :: acc,none | 0.3968 |
# | arc_challenge :: acc_norm,none | 0.4138 |
# | mmlu_stem :: acc,none | 0.4015 |
class Test_Granite_4_0_H_1B(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/granite-4.0-h-1b" # "ibm-granite/granite-4.0-h-1b"
GROUP_SIZE = 32
EVAL_BATCH_SIZE = 1
LOAD_BACKEND = BACKEND.TORCH
EVAL_TASKS = {
EVAL.LM_EVAL.ARC_CHALLENGE: {
"chat_template": True,
"acc": {
"value": 0.3968,
"floor_pct": 0.04,
"ceil_pct": 0.10,
},
"acc_norm": {
"value": 0.4138,
"floor_pct": 0.04,
"ceil_pct": 0.10,
},
},
EVAL.LM_EVAL.MMLU_STEM: {
"chat_template": False,
"acc": {
"value": 0.4015,
"floor_pct": 0.1,
"ceil_pct": 0.20,
},
},
}

def test_granite(self):
self.quant_lm_eval()
48 changes: 48 additions & 0 deletions tests/models/test_granite_4_0_h_350m.py
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium
from gptqmodel import BACKEND
from model_test import ModelTest

from gptqmodel.utils.eval import EVAL


# a100:0, TORCH kernel
# desc_act = False, act_group_aware = True
# | Metric | MARLIN |
# |--------------------------------|----------|
# | arc_challenge :: acc,none | 0.3046 |
# | arc_challenge :: acc_norm,none | 0.3157 |
# | mmlu_stem :: acc,none | 0.2915 |
class Test_Granite_4_0_H_350M(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/granite-4.0-h-350m" # "ibm-granite/granite-4.0-h-350m"
GROUP_SIZE = 32
EVAL_BATCH_SIZE = 16
LOAD_BACKEND = BACKEND.TORCH
EVAL_TASKS = {
EVAL.LM_EVAL.ARC_CHALLENGE: {
"chat_template": True,
"acc": {
"value": 0.3046,
"floor_pct": 0.04,
"ceil_pct": 0.10,
},
"acc_norm": {
"value": 0.3157,
"floor_pct": 0.04,
"ceil_pct": 0.10,
},
},
EVAL.LM_EVAL.MMLU_STEM: {
"chat_template": False,
"acc": {
"value": 0.2915,
"floor_pct": 0.1,
"ceil_pct": 0.20,
},
},
}

def test_granite(self):
self.quant_lm_eval()
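
For completeness, a checkpoint quantized by this flow loads back through the same model registry for inference. A minimal sketch, assuming GPTQModel's README-style generate API and a hypothetical local output path:

from gptqmodel import GPTQModel

model = GPTQModel.load("granite-4.0-h-350m-gptq-4bit")  # hypothetical path to a saved quantized model
tokens = model.generate("The capital of France is")[0]
print(model.tokenizer.decode(tokens))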