diff --git a/README.md b/README.md
index 7be7ae956..9365145a4 100644
--- a/README.md
+++ b/README.md
@@ -178,20 +178,21 @@ Native support support some of the most popular multi-modal models:
 
 ## Model Support
 
-| Model | | | | | | | | | |
-|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------|---|
-| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | MobileLLM | ✅ | Qwen 2/2.5 VL | ✅ |
-| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
-| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MPT | ✅ | RefinedWeb | ✅ |
-| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
-| CodeGen | ✅ | GPTBigCod | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
-| Cohere 1-2 | ✅ | GPTQ-Neo/GPT-NeoX | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
-| DBRX Converted | ✅ | GPT-2 | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
-| Deci | ✅ | GPT-J | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
-| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
-| DeepSeek-V2-Lite | ✅ | Granite | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | | |
-| Dream | ✅ | GRIN-MoE | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | | |
-| ERNIE 4.5 | ✅ | Hymba | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ | | |
+| Model | | | | | | | | | |
+|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------------|---|
+| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
+| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2/2.5 VL | ✅ |
+| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
+| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | LING/RING | ✅ | MPT | ✅ | RefinedWeb | ✅ |
+| CodeGen | ✅ | GPTBigCode | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
+| Cohere 1-2 | ✅ | GPT-Neo/GPT-NeoX | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
+| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
+| Deci | ✅ | GPT-J | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
+| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
+| DeepSeek-V2-Lite | ✅ | Granite | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
+| Dream | ✅ | GRIN-MoE | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | | |
+| ERNIE 4.5 | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | | |
+
 
 ## Platform and HW Support
 
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index eef9d165e..a0ef3ea5a 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -118,6 +118,8 @@ from .definitions.starcoder2 import Starcoder2QModel # noqa: E402
 from .definitions.telechat2 import TeleChat2QModel
 from .definitions.xverse import XverseQModel # noqa: E402
+from .definitions.bailing_moe import BailingMoeQModel # noqa: E402
+
 
 
 # make quants and inference more determinisitc
 
@@ -208,6 +210,7 @@
     "longcat_flash": LongCatFlashQModel,
     "llava_qwen2": LlavaQwen2QModel,
     "nemotron_h": NemotronHQModel,
+    "bailing_moe": BailingMoeQModel,
 }
 
 SUPPORTED_MODELS = list(MODEL_MAP.keys())
diff --git a/gptqmodel/models/definitions/bailing_moe.py b/gptqmodel/models/definitions/bailing_moe.py
new file mode 100644
index 000000000..7cba27d48
--- /dev/null
+++ b/gptqmodel/models/definitions/bailing_moe.py
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from ..base import BaseQModel
+
+
+class BailingMoeQModel(BaseQModel):
+    # allow a dynamic expert index in layer_modules so we don't need to write out all 64 experts here
+    # config.num_experts contains the actual expert count used for the index
+    dynamic_expert_index = "num_experts"
+    layer_modules_strict = False
+
+    pre_lm_head_norm_module = "model.norm"
+
+    module_tree = [
+        "model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),  # ":!" marks modules that are not quantized
+            "self_attn": ("query_key_value",),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp": {
+                "gate": ("gate:!",),  # <-- 0.5MB per layer. Not worth quantizing
+                "shared_expert": ("gate_proj", "up_proj", "down_proj"),
+                "experts": {
+                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+                },
+            },
+        }
+    ]
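
The `"#"` placeholder under `experts`, together with `dynamic_expert_index = "num_experts"`, is what keeps the definition short: the per-expert entries are filled in at runtime from the model config instead of being listed by hand. For illustration only — this is not GPTQModel's internal code, and the helper name and expert count below are made up for the example — a minimal sketch of the kind of expansion this enables:

```python
# Illustrative sketch: how a "#" expert placeholder could expand into concrete
# module paths once config.num_experts is known. expand_expert_modules is a
# hypothetical helper, not part of GPTQModel.
def expand_expert_modules(layer_idx: int, num_experts: int) -> list[str]:
    expanded = []
    for expert_idx in range(num_experts):
        for proj in ("gate_proj", "up_proj", "down_proj"):
            expanded.append(f"model.layers.{layer_idx}.mlp.experts.{expert_idx}.{proj}")
    return expanded

# e.g. layer 0 with 4 experts -> 12 per-expert linear modules to quantize
print(expand_expert_modules(0, 4))
```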
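
With `bailing_moe` mapped to `BailingMoeQModel` in `MODEL_MAP`, a Bailing-MoE (LING/RING) checkpoint should go through the usual quantization flow. A hedged sketch, assuming the standard `GPTQModel.load` / `quantize` / `save` API from the project README; the model id and calibration texts are placeholders, not tested values:

```python
from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder model id and calibration data -- substitute a real Bailing-MoE/LING
# checkpoint and a proper calibration dataset before running.
model_id = "inclusionAI/Ling-lite"
calibration_dataset = ["gptqmodel is an llm model quantization toolkit."] * 256

quant_config = QuantizeConfig(bits=4, group_size=128)

model = GPTQModel.load(model_id, quant_config)  # resolves to BailingMoeQModel via MODEL_MAP
model.quantize(calibration_dataset)
model.save("./ling-lite-gptq-4bit")
```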