diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 81f0de92a..2e5578034 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -419,7 +419,6 @@ def eval(
     load_kwargs = {}
 
     if llm_backend == "vllm":
-        load_backend = BACKEND.VLLM
         disallowed_keys = {"pretrained", "tokenizer", "gptqmodel", "trust_remote_code", "backend", "model_id_or_path"}
         load_kwargs = {k: v for k, v in model_args.items() if k not in disallowed_keys}
 
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
index c7fca5666..39cc0c9ee 100644
--- a/gptqmodel/models/loader.py
+++ b/gptqmodel/models/loader.py
@@ -372,6 +372,8 @@ def from_quantized(
 
         qcfg.calculate_bits_per_weight()
 
+        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
+
         if backend == BACKEND.VLLM or backend == BACKEND.SGLANG:
             if backend == BACKEND.VLLM:
                 if qcfg.format != FORMAT.GPTQ and qcfg.format != FORMAT.GEMM:
@@ -409,7 +411,10 @@
                 model,
                 quantized=True,
                 quantize_config=qcfg,
+                tokenizer=tokenizer,
                 qlinear_kernel=None,
+                load_quantized_model=True,
+                trust_remote_code=trust_remote_code,
                 model_local_path=model_local_path,
             )
 
@@ -784,8 +789,6 @@ def assign(mod, device_id):
 
         model.eval()
 
-        tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)
-
         if backend == BACKEND.MLX:
             import tempfile
             try:
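
Note: the net effect of the loader.py hunks is that the tokenizer is now loaded once, before the backend dispatch, so it is available on the vLLM/SGLang path as well and can be handed to the quantized-model wrapper via the new tokenizer= keyword; the AutoTokenizer call removed in the last hunk only ran on the non-vLLM/SGLang path. A minimal usage sketch of what this enables, assuming GPTQModel.from_quantized() routes through the patched loader and that the wrapper exposes the tokenizer it receives as model.tokenizer (that attribute name is an assumption; only the tokenizer= keyword is confirmed by the diff):

    # Sketch under the assumptions above; the model id is illustrative only.
    from gptqmodel import BACKEND, GPTQModel

    model = GPTQModel.from_quantized(
        "TheBloke/Llama-2-7B-GPTQ",  # illustrative model id
        backend=BACKEND.VLLM,        # tokenizer is now attached on this path too
        trust_remote_code=False,
    )

    # Before this change, the vLLM/SGLang branch returned early, before the
    # AutoTokenizer.from_pretrained() call removed in the last hunk, so the
    # returned wrapper carried no tokenizer on those backends.
    ids = model.tokenizer("hello world")["input_ids"]  # assumed attribute
    print(model.tokenizer.decode(ids))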