Skip to content
Merged

Threadx #1945

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion examples/benchmark/generation_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@

import torch
from datasets import Dataset, load_dataset
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from logbar import LogBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig


logger = LogBar.shared()

random.seed(0)
Expand Down
2 changes: 2 additions & 0 deletions examples/benchmark/ipex.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


try:
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
Expand All @@ -18,6 +19,7 @@

import argparse


parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")
Expand Down
4 changes: 3 additions & 1 deletion examples/benchmark/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
import argparse
import os

from gptqmodel.utils.perplexity import Perplexity
from transformers import AutoTokenizer

from gptqmodel.utils.perplexity import Perplexity


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Expand Down
3 changes: 3 additions & 0 deletions examples/eora/eora_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,20 @@
# -- do not touch
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# -- end do not touch


# from models.model_test import ModelTest # noqa: E402
from eora_calibration_data_construction import construct_c4, construct_mmlu

from gptqmodel import GPTQModel, QuantizeConfig # noqa: E402
from gptqmodel.adapter.adapter import Lora
from gptqmodel.utils.torch import torch_empty_cache # noqa: E402


## meta-llama/Llama-3.2-1B
## meta-llama/Llama-3.2-3B
## meta-llama/Meta-Llama-3-8B
Expand Down
2 changes: 2 additions & 0 deletions examples/eora/eora_load_and_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
# -- do not touch
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# -- end do not touch

from gptqmodel import BACKEND, GPTQModel # noqa: E402
from gptqmodel.adapter.adapter import Lora # noqa: E402


if __name__ == '__main__':
import argparse

Expand Down
4 changes: 3 additions & 1 deletion examples/eora/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
# -- do not touch
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# -- end do not touch

from typing import Optional # noqa: E402

from lm_eval.utils import make_table # noqa: E402

from gptqmodel import BACKEND, GPTQModel # noqa: E402
from gptqmodel.adapter.adapter import Lora # noqa: E402
from gptqmodel.utils.eval import EVAL # noqa: E402
from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
from lm_eval.utils import make_table # noqa: E402


def bench(path: str, backend: BACKEND, adapter: Optional[Lora], task):
Expand Down
3 changes: 3 additions & 0 deletions examples/eora/post_quant_eora_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,18 @@
# -- do not touch
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# -- end do not touch


from eora_calibration_data_construction import construct_ARC, construct_c4, construct_mmlu

from gptqmodel import GPTQModel # noqa: E402
from gptqmodel.adapter.adapter import Lora # noqa: E402


if __name__ == '__main__':
import argparse

Expand Down
4 changes: 3 additions & 1 deletion examples/evaluation/run_language_modeling_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import LanguageModelingTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
Expand Down
4 changes: 3 additions & 1 deletion examples/evaluation/run_sequence_classification_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@

import datasets
import torch
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import SequenceClassificationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer


DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
Expand Down
4 changes: 3 additions & 1 deletion examples/evaluation/run_text_summarization_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@

import datasets
import torch
from transformers import AutoTokenizer, GenerationConfig

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import TextSummarizationTask
from gptqmodel.utils.torch import torch_empty_cache
from transformers import AutoTokenizer, GenerationConfig


os.system("pip install py7zr")

Expand Down
1 change: 1 addition & 0 deletions examples/inference/run_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
Expand Down
4 changes: 3 additions & 1 deletion examples/inference/run_with_different_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
import sys
from argparse import ArgumentParser

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Expand Down
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

import os

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Expand Down
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_wikitext2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

Expand Down
1 change: 1 addition & 0 deletions examples/quantization/transformers_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
Expand Down
3 changes: 2 additions & 1 deletion format/format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ pip install -U ruff==0.13.0 isort==6.0.1
ruff check ../gptqmodel/models ../gptqmodel/nn_modules ../gptqmodel/quantization ../gptqmodel/utils ../gptqmodel/__init__.py ../examples ../tests ../setup.py --fix --unsafe-fixes
ruff_status=$?

isort -l 119 -e ../
# isort is too slow
# isort -l 119 -e ../

# Exit with the status code of ruff check
exit $ruff_status
1 change: 1 addition & 0 deletions gptqmodel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .utils.exllama import exllama_set_max_input_length
from .version import __version__


if os.getenv('GPTQMODEL_USE_MODELSCOPE', 'False').lower() in ['true', '1']:
try:
from modelscope.utils.hf_util.patcher import patch_hub
Expand Down
16 changes: 10 additions & 6 deletions gptqmodel/looper/gptq_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
from ..looper.loop_processor import LoopProcessor, get_max_memory
from ..looper.named_module import NamedModule
from ..models import BaseQModel
from ..models._const import CPU
from ..models.writer import (PROCESS_LOG_FWD_TIME, PROCESS_LOG_LAYER, PROCESS_LOG_MODULE, PROCESS_LOG_NAME,
PROCESS_LOG_TIME, PROCESS_MAX_MEMORY, QUANT_LOG_DAMP, QUANT_LOG_LOSS, QUANT_LOG_NSAMPLES)
from ..quantization import GPTQ, GPTQv2
from ..quantization.config import METHOD, QuantizeConfig
from ..utils.importer import select_quant_linear
from ..utils.logger import setup_logger
from ..utils.memory import MEM_LORD
from ..utils.model import create_quant_module, find_modules, move_to, pack_model, pack_module
from ..utils.offload import undo_offload_to_disk
from ..utils.torch import HAS_CUDA, torch_streamCtx, torch_sync
Expand Down Expand Up @@ -127,7 +127,10 @@ def process(self, module: NamedModule):
g = self.tasks[module.name]

wq, q_scales, q_zeros, q_g_idx, duration, avg_loss, damp_percent, nsamples = g.quantize()
MEM_LORD.free((q_scales, q_zeros, q_g_idx))

q_scales = q_scales.to(CPU)
q_zeros = q_zeros.to(CPU)
q_g_idx = q_g_idx.to(CPU)

with self.lock:
module.state.update({"q_scales": q_scales})
Expand Down Expand Up @@ -198,7 +201,7 @@ def process(self, module: NamedModule):
"wq": wq, # fp16, quantized weight but not int4 (packed qweight)
})

MEM_LORD.free(module.weight)
# single largest deallocation of vram happens here
module.weight.data = wq

# submodule_finalized is called in reverse after all next sequential processes are called
Expand All @@ -215,6 +218,10 @@ def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
q_scales = module.state.pop("q_scales")
q_g_idx = module.state.pop("q_g_idx")

assert q_zeros.device == CPU
assert q_scales.device == CPU
assert q_g_idx.device == CPU

layers = find_modules(model.model)

# replace module with quantized module
Expand Down Expand Up @@ -251,7 +258,6 @@ def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
with self.lock:
self.result_pop(module.full_name)

# MEM_LORD.free(module.weight)
module.unregister_parameter("weight")

def finalize(self, model: BaseQModel, **kwargs):
Expand All @@ -260,14 +266,12 @@ def finalize(self, model: BaseQModel, **kwargs):
torch_sync()

model.model = undo_offload_to_disk(module=model.model, include_buffers=True, delete_offload_folders=True)
MEM_LORD.free(model.model)

# print("finalize")
# print_module_tree(model.model)

# set quantized state
model.quantized = True

model.quantize_config.quant_method = METHOD.GPTQ

super().finalize(model=model, **kwargs)
Expand Down
Loading