36 changes: 28 additions & 8 deletions gptqmodel/looper/awq_processor.py
@@ -48,14 +48,34 @@ class _AWQLayerState:
     lock: threading.Lock = field(default_factory=threading.Lock)
 
 class AWQProcessor(LoopProcessor):
-    def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func,
-                 calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int, gptq_model, model,
-                 require_fwd: bool = True, calculate_w_wq_diff: bool = False):
-
-        super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration,
-                         calibration_concat_size=calibration_concat_size, calibration_sort=calibration_sort,
-                         prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
-                         require_fwd=require_fwd, fwd_after_process=False)
+    def __init__(
+        self,
+        tokenizer,
+        qcfg: QuantizeConfig,
+        calibration,
+        prepare_dataset_func,
+        calibration_concat_size: Optional[int],
+        calibration_sort: Optional[str],
+        batch_size: int,
+        gptq_model,
+        model,
+        require_fwd: bool = True,
+        calculate_w_wq_diff: bool = False,
+        calibration_concat_separator: Optional[str] = None,
+    ):
+
+        super().__init__(
+            tokenizer=tokenizer,
+            qcfg=qcfg,
+            calibration=calibration,
+            calibration_concat_size=calibration_concat_size,
+            calibration_sort=calibration_sort,
+            calibration_concat_separator=calibration_concat_separator,
+            prepare_dataset_func=prepare_dataset_func,
+            batch_size=batch_size,
+            require_fwd=require_fwd,
+            fwd_after_process=False,
+        )
 
         self.calculate_w_wq_diff = calculate_w_wq_diff
         self.avg_losses = []
32 changes: 23 additions & 9 deletions gptqmodel/looper/eora_processor.py
@@ -28,15 +28,29 @@
 
 
 class EoraProcessor(LoopProcessor):
-    def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func,
-                 calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int,
-                 require_fwd: bool = True
-                 ):
-        super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration,
-                         calibration_concat_size=calibration_concat_size,
-                         calibration_sort=calibration_sort,
-                         prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
-                         require_fwd=require_fwd)
+    def __init__(
+        self,
+        tokenizer,
+        qcfg: QuantizeConfig,
+        calibration,
+        prepare_dataset_func,
+        calibration_concat_size: Optional[int],
+        calibration_sort: Optional[str],
+        batch_size: int,
+        require_fwd: bool = True,
+        calibration_concat_separator: Optional[str] = None,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            qcfg=qcfg,
+            calibration=calibration,
+            calibration_concat_size=calibration_concat_size,
+            calibration_sort=calibration_sort,
+            calibration_concat_separator=calibration_concat_separator,
+            prepare_dataset_func=prepare_dataset_func,
+            batch_size=batch_size,
+            require_fwd=require_fwd,
+        )
 
         # Track per-module segment accumulators keyed by device so we can merge
         # contributions without repeatedly moving data through the CPU.
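The retained comment above describes a design worth illustrating: per-module accumulators bucketed by device, so partial sums never round-trip through the CPU. A minimal, hypothetical rendition of that pattern (class and method names are illustrative, not from this codebase):

    # Hypothetical sketch: accumulate per-module contributions keyed by device,
    # so partial results stay on their originating GPU instead of bouncing via CPU.
    from collections import defaultdict
    import torch

    class SegmentAccumulator:
        def __init__(self):
            # {module_name: {device: running_sum_tensor}}
            self._segments = defaultdict(dict)

        def add_segment(self, module_name: str, tensor: torch.Tensor):
            per_device = self._segments[module_name]
            dev = tensor.device
            if dev in per_device:
                per_device[dev] += tensor  # in-place on the same device: no CPU hop
            else:
                per_device[dev] = tensor.clone()

        def merge(self, module_name: str, target_device: torch.device) -> torch.Tensor:
            # One device-to-device transfer per source device, only at merge time.
            parts = self._segments[module_name].values()
            return sum(p.to(target_device) for p in parts)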
34 changes: 25 additions & 9 deletions gptqmodel/looper/gptq_processor.py
@@ -31,15 +31,31 @@
 lock = threading.Lock()
 
 class GPTQProcessor(LoopProcessor):
-    def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func,
-                 calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int,
-                 require_fwd: bool = True, calculate_w_wq_diff: bool = False):
-
-        super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration,
-                         calibration_concat_size=calibration_concat_size,
-                         calibration_sort=calibration_sort,
-                         prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
-                         require_fwd=require_fwd)
+    def __init__(
+        self,
+        tokenizer,
+        qcfg: QuantizeConfig,
+        calibration,
+        prepare_dataset_func,
+        calibration_concat_size: Optional[int],
+        calibration_sort: Optional[str],
+        batch_size: int,
+        require_fwd: bool = True,
+        calculate_w_wq_diff: bool = False,
+        calibration_concat_separator: Optional[str] = None,
+    ):
+
+        super().__init__(
+            tokenizer=tokenizer,
+            qcfg=qcfg,
+            calibration=calibration,
+            calibration_concat_size=calibration_concat_size,
+            calibration_sort=calibration_sort,
+            calibration_concat_separator=calibration_concat_separator,
+            prepare_dataset_func=prepare_dataset_func,
+            batch_size=batch_size,
+            require_fwd=require_fwd,
+        )
 
         self.calculate_w_wq_diff = calculate_w_wq_diff
         self.avg_losses = []
12 changes: 8 additions & 4 deletions gptqmodel/looper/loop_processor.py
@@ -57,6 +57,7 @@ def __init__(
         prepare_dataset_func: Optional[Callable] = None,
         calibration_concat_size: Optional[int] = None,
         calibration_sort: Optional[str] = None,
+        calibration_concat_separator: Optional[str] = None,
         batch_size: int = 1,
         require_fwd: bool = True,
         fwd_after_process: bool = True,
@@ -128,10 +129,13 @@
         if prepare_dataset_func is None:
             raise ValueError("prepare_dataset_func must be provided when calibration data is supplied.")
 
-        calibration = prepare_dataset_func(calibration_dataset=calibration,
-                                           calibration_dataset_concat_size=calibration_concat_size,
-                                           calibration_dataset_sort=calibration_sort,
-                                           batch_size=batch_size)
+        calibration = prepare_dataset_func(
+            calibration_dataset=calibration,
+            calibration_dataset_concat_size=calibration_concat_size,
+            calibration_dataset_sort=calibration_sort,
+            batch_size=batch_size,
+            calibration_concat_separator=calibration_concat_separator,
+        )
 
         # Calculate the average length of the average input_ids
         total_input_ids_length = 0
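Note on how the new argument flows: `prepare_dataset_func` now receives `calibration_concat_separator` alongside the existing concat-size and sort options. A minimal sketch of what a separator-aware packing step could look like (hypothetical helper name and a character-length budget; the real implementation likely packs token ids rather than strings):

    # Hypothetical sketch, not this PR's implementation: pack calibration rows
    # up to a budget, joining adjacent rows with the configurable separator.
    from typing import List, Optional

    def pack_calibration_rows(rows: List[str], concat_size: int,
                              separator: Optional[str] = None) -> List[str]:
        sep = separator if separator is not None else " "  # prior behavior: fixed " "
        packed: List[str] = []
        current = ""
        for row in rows:
            candidate = row if not current else current + sep + row
            if len(candidate) <= concat_size:
                current = candidate
            else:
                if current:
                    packed.append(current)
                current = row
        if current:
            packed.append(current)
        return packed

With a separator such as "\n\n", adjacent samples stay clearly delimited in the concatenated stream; under the old fixed " " they simply ran together.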
36 changes: 26 additions & 10 deletions gptqmodel/looper/native_processor.py
@@ -21,16 +21,32 @@
 
 # v2 requires that we also need to capture/store non-quantized inputs
 class NativeProcessor(LoopProcessor):
-    def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func,
-                 calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int,
-                 require_fwd: bool = True):
-
-        super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration,
-                         calibration_concat_size=calibration_concat_size,
-                         calibration_sort=calibration_sort,
-                         prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
-                         require_fwd=require_fwd, fwd_after_process=False,
-                         fwd_all_modules_in_single_pass=True)
+    def __init__(
+        self,
+        tokenizer,
+        qcfg: QuantizeConfig,
+        calibration,
+        prepare_dataset_func,
+        calibration_concat_size: Optional[int],
+        calibration_sort: Optional[str],
+        batch_size: int,
+        require_fwd: bool = True,
+        calibration_concat_separator: Optional[str] = None,
+    ):
+
+        super().__init__(
+            tokenizer=tokenizer,
+            qcfg=qcfg,
+            calibration=calibration,
+            calibration_concat_size=calibration_concat_size,
+            calibration_sort=calibration_sort,
+            calibration_concat_separator=calibration_concat_separator,
+            prepare_dataset_func=prepare_dataset_func,
+            batch_size=batch_size,
+            require_fwd=require_fwd,
+            fwd_after_process=False,
+            fwd_all_modules_in_single_pass=True,
+        )
 
         self.native_inp_caches = {}
 
33 changes: 25 additions & 8 deletions gptqmodel/looper/qqq_processor.py
@@ -26,14 +26,31 @@
 log = setup_logger()
 
 class QQQProcessor(LoopProcessor):
-    def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func,
-                 calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int,
-                 require_fwd: bool = True, calculate_w_wq_diff: bool = False):
-
-        super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration,
-                         calibration_concat_size=calibration_concat_size, calibration_sort=calibration_sort,
-                         prepare_dataset_func=prepare_dataset_func, batch_size=batch_size,
-                         require_fwd=require_fwd)
+    def __init__(
+        self,
+        tokenizer,
+        qcfg: QuantizeConfig,
+        calibration,
+        prepare_dataset_func,
+        calibration_concat_size: Optional[int],
+        calibration_sort: Optional[str],
+        batch_size: int,
+        require_fwd: bool = True,
+        calculate_w_wq_diff: bool = False,
+        calibration_concat_separator: Optional[str] = None,
+    ):
+
+        super().__init__(
+            tokenizer=tokenizer,
+            qcfg=qcfg,
+            calibration=calibration,
+            calibration_concat_size=calibration_concat_size,
+            calibration_sort=calibration_sort,
+            calibration_concat_separator=calibration_concat_separator,
+            prepare_dataset_func=prepare_dataset_func,
+            batch_size=batch_size,
+            require_fwd=require_fwd,
+        )
 
         self.calculate_w_wq_diff = calculate_w_wq_diff
         self.avg_losses = []
2 changes: 0 additions & 2 deletions gptqmodel/models/_const.py
@@ -131,5 +131,3 @@ def get_best_device(backend: BACKEND = BACKEND.AUTO) -> torch.device:
 EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048
 
 EXPERT_INDEX_PLACEHOLDER = "{expert_index}"
-
-CALIBRATION_DATASET_CONCAT_CHAR = " "
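The removed constant was the previously fixed joiner for concatenated calibration samples. A plausible before/after reading (an assumption, since the consuming call site is outside this diff):

    # Before: samples were always joined with the hardcoded single space.
    joined = CALIBRATION_DATASET_CONCAT_CHAR.join(samples)
    # After: the joiner is caller-supplied; None presumably falls back to a default.
    joined = (calibration_concat_separator or " ").join(samples)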
4 changes: 3 additions & 1 deletion gptqmodel/models/auto.py
@@ -96,6 +96,7 @@
 from .definitions.gpt_neox import GPTNeoXQModel # noqa: E402
 from .definitions.gpt_oss import GPTOSSGPTQ # noqa: E402
 from .definitions.gptj import GptJQModel # noqa: E402
+from .definitions.granitemoehybrid import GraniteMoeHybridQModel
 from .definitions.grinmoe import GrinMoeQModel # noqa: E402
 from .definitions.hymba import HymbaQModel # noqa: E402
 from .definitions.instella import InstellaQModel # noqa: E402
@@ -139,7 +140,6 @@
 from .definitions.starcoder2 import Starcoder2QModel # noqa: E402
 from .definitions.telechat2 import TeleChat2QModel
 from .definitions.xverse import XverseQModel # noqa: E402
-from .definitions.granitemoehybrid import GraniteMoeHybridQModel
 
 
 # make quants and inference more determinisitc
@@ -692,6 +692,7 @@ def generate(
         calibration_dataset_sort: Optional[str] = None,
         batch_size: Optional[int] = 1,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        calibration_concat_separator: Optional[str] = None,
         # pass-through vars for load()
         trust_remote_code: bool = False,
         dtype: Optional[Union[str, torch.dtype]] = None,
@@ -736,5 +737,6 @@
             calibration_dataset_sort=calibration_dataset_sort,
             batch_size=batch_size,
             tokenizer=tokenizer,
+            calibration_concat_separator=calibration_concat_separator,
         )
         return
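Taken together, a hypothetical end-to-end call through the `generate` signature shown above (the entry point's remaining arguments and their defaults are assumptions, not part of this diff):

    # Hypothetical usage sketch; only calibration_concat_separator is new in this PR.
    from gptqmodel import GPTQModel

    GPTQModel.generate(
        calibration_dataset=rows,             # assumed parameter from the elided signature
        calibration_dataset_concat_size=2048,
        calibration_dataset_sort=None,
        batch_size=4,
        tokenizer=tokenizer,
        calibration_concat_separator="\n\n",  # new: custom joiner between concatenated samples
    )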