From 6c2758b1a515d1c63f6805e2278f620694e1cd68 Mon Sep 17 00:00:00 2001
From: ZX-ModelCloud
Date: Wed, 22 Oct 2025 12:08:49 +0800
Subject: [PATCH 1/2] load_backend cannot use BACKEND.VLLM. lm-eval also uses
 vllm to load models, causing OOM errors.

Signed-off-by: ZX-ModelCloud
---
 gptqmodel/models/auto.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 81f0de92a..2e5578034 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -419,7 +419,6 @@ def eval(
         load_kwargs = {}
 
         if llm_backend == "vllm":
-            load_backend = BACKEND.VLLM
             disallowed_keys = {"pretrained", "tokenizer", "gptqmodel", "trust_remote_code", "backend", "model_id_or_path"}
             load_kwargs = {k: v for k, v in model_args.items() if k not in disallowed_keys}
 

From b71c13f2f1d1f0316c453ad9dfd24e839ae678f7 Mon Sep 17 00:00:00 2001
From: ZX-ModelCloud
Date: Wed, 22 Oct 2025 12:09:23 +0800
Subject: [PATCH 2/2] fix tokenizer is None when load by BACKEND.VLLM

Signed-off-by: ZX-ModelCloud
---
 gptqmodel/models/loader.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
index c7fca5666..39cc0c9ee 100644
--- a/gptqmodel/models/loader.py
+++ b/gptqmodel/models/loader.py
@@ -372,6 +372,8 @@ def from_quantized(
 
         qcfg.calculate_bits_per_weight()
 
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
+
         if backend == BACKEND.VLLM or backend == BACKEND.SGLANG:
             if backend == BACKEND.VLLM:
                 if qcfg.format != FORMAT.GPTQ and qcfg.format != FORMAT.GEMM:
@@ -409,7 +411,10 @@
                 model,
                 quantized=True,
                 quantize_config=qcfg,
+                tokenizer=tokenizer,
                 qlinear_kernel=None,
+                load_quantized_model=True,
+                trust_remote_code=trust_remote_code,
                 model_local_path=model_local_path,
             )
 
@@ -784,8 +789,6 @@ def assign(mod, device_id):
 
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-
         if backend == BACKEND.MLX:
             import tempfile
             try: