ModelCloud · Qubitium · Sep 29, 2025 · Sep 29, 2025 · Sep 29, 2025 · Sep 29, 2025
diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py
@@ -12,11 +12,13 @@
 
 import torch
 from datasets import Dataset, load_dataset
-from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from logbar import LogBar
 from transformers import AutoTokenizer, GenerationConfig
 from transformers.generation.logits_process import LogitsProcessor
 
+from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
+
+
 logger = LogBar.shared()
 
 random.seed(0)

diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py
@@ -9,6 +9,7 @@
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
+
 try:
     from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
     bind_cores_for_best_perf()
@@ -18,6 +19,7 @@
 
 import argparse
 
+
 parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
 parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
 parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")

diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py
@@ -6,9 +6,11 @@
 import argparse
 import os
 
-from gptqmodel.utils.perplexity import Perplexity
 from transformers import AutoTokenizer
 
+from gptqmodel.utils.perplexity import Perplexity
+
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 

diff --git a/examples/eora/eora_generation.py b/examples/eora/eora_generation.py
@@ -16,17 +16,20 @@
 # -- do not touch
 import os
 
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 
 # from models.model_test import ModelTest  # noqa: E402
 from eora_calibration_data_construction import construct_c4, construct_mmlu
+
 from gptqmodel import GPTQModel, QuantizeConfig  # noqa: E402
 from gptqmodel.adapter.adapter import Lora
 from gptqmodel.utils.torch import torch_empty_cache  # noqa: E402
 
+
 ## meta-llama/Llama-3.2-1B
 ## meta-llama/Llama-3.2-3B
 ## meta-llama/Meta-Llama-3-8B

diff --git a/examples/eora/eora_load_and_inference.py b/examples/eora/eora_load_and_inference.py
@@ -16,13 +16,15 @@
 # -- do not touch
 import os
 
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402
 from gptqmodel.adapter.adapter import Lora  # noqa: E402
 
+
 if __name__ == '__main__':
     import argparse
 

diff --git a/examples/eora/evaluation.py b/examples/eora/evaluation.py
@@ -16,17 +16,19 @@
 # -- do not touch
 import os
 
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 from typing import Optional  # noqa: E402
 
+from lm_eval.utils import make_table  # noqa: E402
+
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402
 from gptqmodel.adapter.adapter import Lora  # noqa: E402
 from gptqmodel.utils.eval import EVAL  # noqa: E402
 from gptqmodel.utils.torch import torch_empty_cache  # noqa: E402
-from lm_eval.utils import make_table  # noqa: E402
 
 
 def bench(path: str, backend: BACKEND, adapter: Optional[Lora], task):

diff --git a/examples/eora/post_quant_eora_generation.py b/examples/eora/post_quant_eora_generation.py
@@ -16,15 +16,18 @@
 # -- do not touch
 import os
 
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 
 from eora_calibration_data_construction import construct_ARC, construct_c4, construct_mmlu
+
 from gptqmodel import GPTQModel  # noqa: E402
 from gptqmodel.adapter.adapter import Lora  # noqa: E402
 
+
 if __name__ == '__main__':
     import argparse
 

diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py
@@ -7,10 +7,12 @@
 
 import datasets
 import torch
+from transformers import AutoTokenizer
+
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import LanguageModelingTask
 from gptqmodel.utils.torch import torch_empty_cache
-from transformers import AutoTokenizer
+
 
 DATASET = "tatsu-lab/alpaca"
 WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"

diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py
@@ -8,10 +8,12 @@
 
 import datasets
 import torch
+from transformers import AutoTokenizer
+
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import SequenceClassificationTask
 from gptqmodel.utils.torch import torch_empty_cache
-from transformers import AutoTokenizer
+
 
 DATASET = "cardiffnlp/tweet_sentiment_multilingual"
 TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"

diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py
@@ -8,10 +8,12 @@
 
 import datasets
 import torch
+from transformers import AutoTokenizer, GenerationConfig
+
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
 from gptqmodel.eval_tasks import TextSummarizationTask
 from gptqmodel.utils.torch import torch_empty_cache
-from transformers import AutoTokenizer, GenerationConfig
+
 
 os.system("pip install py7zr")
 

diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py
@@ -5,6 +5,7 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+
 tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
 quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
 print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))

diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py
@@ -8,9 +8,11 @@
 import sys
 from argparse import ArgumentParser
 
-from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
 from transformers import AutoTokenizer
 
+from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
+
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 

diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py
@@ -5,9 +5,11 @@
 
 import os
 
-from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
 from transformers import AutoTokenizer
 
+from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
+
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 

diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py
@@ -5,9 +5,11 @@
 
 import torch
 from datasets import load_dataset
-from gptqmodel import GPTQModel, QuantizeConfig
 from transformers import AutoTokenizer
 
+from gptqmodel import GPTQModel, QuantizeConfig
+
+
 pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"
 

diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py
@@ -5,6 +5,7 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
 
+
 model_id = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]

diff --git a/format/format.sh b/format/format.sh
@@ -8,7 +8,8 @@ pip install -U ruff==0.13.0 isort==6.0.1
 ruff check ../gptqmodel/models ../gptqmodel/nn_modules ../gptqmodel/quantization ../gptqmodel/utils ../gptqmodel/__init__.py ../examples ../tests ../setup.py --fix --unsafe-fixes
 ruff_status=$?
 
-isort -l 119 -e ../
+# isort is disabled because it is too slow; uncomment the line below to re-enable it
+# isort -l 119 -e ../
 
 # Exit with the status code of ruff check
 exit $ruff_status
diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py
@@ -11,6 +11,7 @@
 from .utils.exllama import exllama_set_max_input_length
 from .version import __version__
 
+
 if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
     try:
         from modelscope.utils.hf_util.patcher import patch_hub

diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py
@@ -13,13 +13,13 @@
 from ..looper.loop_processor import LoopProcessor, get_max_memory
 from ..looper.named_module import NamedModule
 from ..models import BaseQModel
+from ..models._const import CPU
 from ..models.writer import (PROCESS_LOG_FWD_TIME, PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, PROCESS_LOG_NAME,
                              PROCESS_LOG_TIME, PROCESS_MAX_MEMORY, QUANT_LOG_DAMP, QUANT_LOG_LOSS, QUANT_LOG_NSAMPLES)
 from ..quantization import GPTQ, GPTQv2
 from ..quantization.config import METHOD, QuantizeConfig
 from ..utils.importer import select_quant_linear
 from ..utils.logger import setup_logger
-from ..utils.memory import MEM_LORD
 from ..utils.model import create_quant_module, find_modules, move_to, pack_model, pack_module
 from ..utils.offload import undo_offload_to_disk
 from ..utils.torch import HAS_CUDA, torch_streamCtx, torch_sync
@@ -127,7 +127,10 @@ def process(self, module: NamedModule):
             g = self.tasks[module.name]
 
         wq, q_scales, q_zeros, q_g_idx, duration, avg_loss, damp_percent, nsamples = g.quantize()
-        MEM_LORD.free((q_scales, q_zeros, q_g_idx))
+
+        q_scales = q_scales.to(CPU)
+        q_zeros = q_zeros.to(CPU)
+        q_g_idx = q_g_idx.to(CPU)
 
         with self.lock:
             module.state.update({"q_scales": q_scales})
@@ -198,7 +201,7 @@ def process(self, module: NamedModule):
                 "wq": wq,  # fp16, quantized weight but not int4 (packed qweight)
             })
 
-        MEM_LORD.free(module.weight)
+        # The single largest deallocation of VRAM happens here: assigning wq replaces the original weight tensor (freed once nothing else references it)
         module.weight.data = wq
 
     # submodule_finalized is called in reverse after all next sequential processes are called
@@ -215,6 +218,10 @@ def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
             q_scales = module.state.pop("q_scales")
             q_g_idx = module.state.pop("q_g_idx")
 
+        assert q_zeros.device == CPU
+        assert q_scales.device == CPU
+        assert q_g_idx.device == CPU
+
         layers = find_modules(model.model)
 
         # replace module with quantized module
@@ -251,7 +258,6 @@ def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
         with self.lock:
             self.result_pop(module.full_name)
 
-        # MEM_LORD.free(module.weight)
         module.unregister_parameter("weight")
 
     def finalize(self, model: BaseQModel, **kwargs):
@@ -260,14 +266,12 @@ def finalize(self, model: BaseQModel, **kwargs):
             torch_sync()
 
         model.model = undo_offload_to_disk(module=model.model, include_buffers=True, delete_offload_folders=True)
-        MEM_LORD.free(model.model)
 
         # print("finalize")
         # print_module_tree(model.model)
 
         # set quantized state
         model.quantized = True
-
         model.quantize_config.quant_method = METHOD.GPTQ
 
         super().finalize(model=model, **kwargs)