ModelCloud · Qubitium · Oct 22, 2025 · Oct 22, 2025 · Oct 22, 2025
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
@@ -419,7 +419,6 @@ def eval(
             load_kwargs = {}
 
             if llm_backend == "vllm":
-                load_backend = BACKEND.VLLM
                 disallowed_keys = {"pretrained", "tokenizer", "gptqmodel", "trust_remote_code", "backend", "model_id_or_path"}
                 load_kwargs = {k: v for k, v in model_args.items() if k not in disallowed_keys}
 

diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
@@ -372,6 +372,8 @@ def from_quantized(
 
         qcfg.calculate_bits_per_weight()
 
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
+
         if backend == BACKEND.VLLM or backend == BACKEND.SGLANG:
             if backend == BACKEND.VLLM:
                 if qcfg.format != FORMAT.GPTQ and qcfg.format != FORMAT.GEMM:
@@ -409,7 +411,10 @@ def from_quantized(
                 model,
                 quantized=True,
                 quantize_config=qcfg,
+                tokenizer=tokenizer,
                 qlinear_kernel=None,
+                load_quantized_model=True,
+                trust_remote_code=trust_remote_code,
                 model_local_path=model_local_path,
             )
 
@@ -784,8 +789,6 @@ def assign(mod, device_id):
 
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-
         if backend == BACKEND.MLX:
             import tempfile
             try: