From 6c2758b1a515d1c63f6805e2278f620694e1cd68 Mon Sep 17 00:00:00 2001
From: ZX-ModelCloud
Date: Wed, 22 Oct 2025 12:08:49 +0800
Subject: [PATCH 1/2] load_backend cannot use BACKEND.VLLM. lm-eval also uses
 vllm to load models, causing OOM errors.

Signed-off-by: ZX-ModelCloud
---
 gptqmodel/models/auto.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 81f0de92a..2e5578034 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -419,7 +419,6 @@ def eval(
         load_kwargs = {}
 
         if llm_backend == "vllm":
-            load_backend = BACKEND.VLLM
             disallowed_keys = {"pretrained", "tokenizer", "gptqmodel", "trust_remote_code", "backend", "model_id_or_path"}
             load_kwargs = {k: v for k, v in model_args.items() if k not in disallowed_keys}
 

From b71c13f2f1d1f0316c453ad9dfd24e839ae678f7 Mon Sep 17 00:00:00 2001
From: ZX-ModelCloud
Date: Wed, 22 Oct 2025 12:09:23 +0800
Subject: [PATCH 2/2] fix tokenizer is None when load by BACKEND.VLLM

Signed-off-by: ZX-ModelCloud
---
 gptqmodel/models/loader.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
index c7fca5666..39cc0c9ee 100644
--- a/gptqmodel/models/loader.py
+++ b/gptqmodel/models/loader.py
@@ -372,6 +372,8 @@ def from_quantized(
 
         qcfg.calculate_bits_per_weight()
 
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
+
         if backend == BACKEND.VLLM or backend == BACKEND.SGLANG:
             if backend == BACKEND.VLLM:
                 if qcfg.format != FORMAT.GPTQ and qcfg.format != FORMAT.GEMM:
@@ -409,7 +411,10 @@
                 model,
                 quantized=True,
                 quantize_config=qcfg,
+                tokenizer=tokenizer,
                 qlinear_kernel=None,
+                load_quantized_model=True,
+                trust_remote_code=trust_remote_code,
                 model_local_path=model_local_path,
             )
 
@@ -784,8 +789,6 @@ def assign(mod, device_id):
 
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-
         if backend == BACKEND.MLX:
             import tempfile
             try: