diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 4ee513de5..8f120fa05 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -168,7 +168,7 @@ def __init__(
         self.model = self.after_model_load(model, load_quantized_model=load_quantized_model)
         self.turtle_model = turtle_model

-        self.compiled = False # set to True while compile() is triggered successfully
+        self.compiled = False  # set to True when compile() is triggered successfully
         self.quantized = quantized
         self.load_quantized_model = load_quantized_model
         if tokenizer is not None:
diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py
index 9aae554f9..e7b512049 100644
--- a/gptqmodel/models/writer.py
+++ b/gptqmodel/models/writer.py
@@ -258,7 +258,8 @@ def debug_saved_config(path):
         # --- end config save block ---

         # Due to shell/turtle state, we need to sync the modules from turtle to shell
-        alias_all_from_turtle_if_meta(shell_model=model, turtle_model=self.turtle_model)
+        if not self.load_quantized_model:
+            alias_all_from_turtle_if_meta(shell_model=model, turtle_model=self.turtle_model)

         state_dict = get_state_dict_for_save(model)

diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
index e6a9259fa..90c40f4ea 100644
--- a/gptqmodel/quantization/config.py
+++ b/gptqmodel/quantization/config.py
@@ -257,9 +257,12 @@ def __post_init__(self):
         if self.damp_percent is None:
             if self.quant_method == METHOD.QQQ:
                 self.damp_percent = 0.005
-                self.damp_auto_increment = 0.001
             else:
                 self.damp_percent = 0.05
+        if self.damp_auto_increment is None:
+            if self.quant_method == METHOD.QQQ:
+                self.damp_auto_increment = 0.001
+            else:
                 self.damp_auto_increment = 0.01

         # TODO FIXME awq compat which didn't have checkpoint_format before merging to gptqmodel
diff --git a/tests/test_serialization.py b/tests/test_serialization.py
index b55bed512..77ffc52bd 100644
--- a/tests/test_serialization.py
+++ b/tests/test_serialization.py
@@ -32,7 +32,7 @@ def test_marlin_local_serialization(self):
         model = GPTQModel.load(tmpdir, device="cuda:0", backend=BACKEND.MARLIN)

     def test_gptq_v1_to_v2_runtime_convert(self):
-        model = GPTQModel.load(self.MODEL_ID, device="cuda:0")
+        model = GPTQModel.load(self.MODEL_ID, device="cuda:0", backend=BACKEND.EXLLAMA_V2)
         self.assertEqual(model.quantize_config.runtime_format, FORMAT.GPTQ_V2)

     def test_gptq_v1_serialization(self):
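
Reviewer note on the config.py hunk: it decouples the damp_auto_increment default from the damp_percent default. Below is a minimal standalone sketch of the post-patch resolution order; the field names, values, and the QQQ method come from the diff, while resolve_damp_defaults itself is a hypothetical helper written only for illustration, not repo code:

    # Mirrors the post-patch defaulting in QuantizeConfig.__post_init__.
    # Before this patch, damp_auto_increment was assigned only inside the
    # damp_percent branch, so a user-supplied damp_percent left it unset.
    def resolve_damp_defaults(quant_method, damp_percent=None, damp_auto_increment=None):
        if damp_percent is None:
            damp_percent = 0.005 if quant_method == "qqq" else 0.05
        if damp_auto_increment is None:  # now resolved independently
            damp_auto_increment = 0.001 if quant_method == "qqq" else 0.01
        return damp_percent, damp_auto_increment

    # e.g. an explicit damp_percent for QQQ still yields the QQQ increment default
    assert resolve_damp_defaults("qqq", damp_percent=0.01) == (0.01, 0.001)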