diff --git a/.gitignore b/.gitignore index 12e0533b1..cc3d0fea7 100644 --- a/.gitignore +++ b/.gitignore @@ -183,3 +183,4 @@ example.py /gptqmodel_ext/marlin/kernel_fp16_ku8b128.cu /gptqmodel_offload/ /gptqmodel_ext/machete/generated/ +AGENT.md diff --git a/README.md b/README.md index 5fbd14aa8..f9a30b639 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@

## Latest News -* 10/30/2025 5.1.0-dev: +Marin model. +AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refractor. +* 10/30/2025 5.1.0-dev: 🎉AWQ support out of beta with full feature support including multi-gpu quant and MoE vram saving. +* 10/30/2025 5.1.0-dev: ✨Marin model. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor. * 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage for large MoE on multi-3090 (24GB) devices. * 10/24/2025 [5.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v5.0.0): 🎉 Data-parallel quant support for `MoE` models on multi-gpu using `nogil` Python. `offload_to_disk` support enabled by default to massively reduce `cpu` ram usage. New `Intel` and `AMD` cpu hw accelerated `TorchFused` kernel. Packing stage is now 4x faster and now inlined with quantization. `Vram` pressure for large models reduced during quantization. @@ -135,12 +136,12 @@ GPT-QModel not only supports GPTQ but also QQQ, GPTQv2, Eora with more quantizat GPT-QModel is a modular design supporting multiple quantization methods and feature extensions. -| Quantization Feature | GPT-QModel | Transformers | vLLM | SGLang | Lora Training | +| Quantization Feature | GPT-QModel | Transformers | vLLM | SGLang | Lora Training | |---------------------------|------------|---|---|---|---------------| | GPTQ | ✅ | ✅ | ✅ | ✅ | ✅ | +| AWQ | ✅ | ✅ | ✅ | ✅ | ✅ | | EoRA | ✅ | ✅ | ✅ | ✅ | x | | Group Aware Act Reordering | ✅ | ✅ | ✅ | ✅ | ✅ | -| AWQ | ✅ | ✅* | ✅* | ✅* | ✅* | | QQQ | ✅ | x | x | x | x | | Rotation | ✅ | x | x | x | x | | GPTAQ | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/gptqmodel/looper/awq_processor.py b/gptqmodel/looper/awq_processor.py index 3ad6b8a51..06ecdc64a 100644 --- a/gptqmodel/looper/awq_processor.py +++ b/gptqmodel/looper/awq_processor.py @@ -3,12 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium -import functools import inspect import math +import threading import time -from collections import defaultdict -from typing import Callable, Dict, List, Optional, Tuple +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Optional, Set, Tuple import torch from torch import nn @@ -25,16 +25,28 @@ from ..nn_modules.qlinear.awq_marlin import AwqMarlinQuantLinear from ..quantization.awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV, WQLinear_GEMVFast, WQLinear_Marlin from ..quantization.awq.quantize.scale import apply_clip, apply_scale -from ..quantization.awq.utils.module import append_str_prefix, get_op_name, set_op_by_name +from ..quantization.awq.utils.module import append_str_prefix, get_op_name, get_op_by_name, set_op_by_name from ..quantization.awq.utils.utils import get_best_device from ..quantization.config import FORMAT, METHOD, QuantizeConfig from ..utils.logger import setup_logger from ..utils.ctx import ctx -from ..utils.model import get_module_by_name_prefix, move_to -from ..utils.torch import CPU, tf32_disable_guard, tf32_enable_guard, torch_sync +from ..utils.model import find_modules, get_module_by_name_prefix, move_to +from ..utils.torch import CPU log = setup_logger() + +@dataclass +class _AWQLayerState: + 
modules: Dict[str, NamedModule] = field(default_factory=dict) + subset_total: Optional[int] = None + processed_subsets: Set[int] = field(default_factory=set) + layer_module: Optional[torch.nn.Module] = None + previous_weight_scale: Optional[float] = None + quantized: bool = False + pending_modules: Set[str] = field(default_factory=set) + lock: threading.Lock = field(default_factory=threading.Lock) + class AWQProcessor(LoopProcessor): def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset_func, calibration_concat_size: Optional[int], calibration_sort: Optional[str], batch_size: int, gptq_model, model, @@ -43,12 +55,17 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset super().__init__(tokenizer=tokenizer, qcfg=qcfg, calibration=calibration, calibration_concat_size=calibration_concat_size, calibration_sort=calibration_sort, prepare_dataset_func=prepare_dataset_func, batch_size=batch_size, - require_fwd=require_fwd) + require_fwd=require_fwd, fwd_after_process=False) self.calculate_w_wq_diff = calculate_w_wq_diff self.avg_losses = [] self.nsamples = 0 + self._nsamples_total = 0 + self._quant_batch_size = batch_size + self._layer_states: Dict[int, _AWQLayerState] = {} + self._layer_states_lock = threading.Lock() + self._scale_context = threading.local() self.gptq_model = gptq_model self.model = model # Whether to apply clipping to the model during quantization. Some models may perform better with this set to False. @@ -58,168 +75,465 @@ def __init__(self, tokenizer, qcfg: QuantizeConfig, calibration, prepare_dataset # " Default is 1GB (1024 * 1024 * 1024)." self.max_chunk_memory = 1024 * 1024 * 1024 - # "The number of parallel samples to run through the model. " - # "A high number of parallel samples can result in OOM during quantization if max_calib_samples is high enough. " - # "If None, runs through all samples at the same time. " - # "You can set this to a low number for more memory efficient quantization." - self.n_parallel_calib_samples = None if batch_size == 1 else batch_size - # This argument avoids real quantization by only applying the scales without quantizing down to FP16. self.export_compatible = False self.version = qcfg.format - # TODO Can it be configured? - # The maximum sequence length of the calibration dataset. Discard samples greater than max_calib_seq_len. - self.max_calib_seq_len = 512 - # Whether to scale using both w/x or just x. 
self.duo_scaling = True - self.modules, self.module_kwargs, self.inps = self.init_quant() + self._module_forward_kwargs: Dict[str, torch.Tensor] = {} + self._initialize_sample_counts() + self._module_forward_kwargs.setdefault("attention_mask", None) def set_calibration_dataset(self, calibration_dataset): raise NotImplementedError("AWQProcessor's calibration_dataset cannot be modified") - def init_quant(self): - modules, _ = get_module_by_name_prefix(self.gptq_model.model, self.gptq_model.extract_layers_node()) - # make sure samples tensor's shape is [1, max_calib_seq_len] - samples = [data['input_ids'][:, :self.max_calib_seq_len] for data in self.calibration_dataset if data['input_ids'].shape[1] >= self.max_calib_seq_len] - - samples = torch.cat(samples, dim=0) - - inps = [] - layer_kwargs = {} - - best_device = get_best_device() - modules[0] = self.gptq_model.pre_quantize(modules[0]) - modules[0] = modules[0].to(best_device) - - # embed should be on same gpu/best device - self.gptq_model.move_embed(best_device) - - # get input and kwargs to layer 0 - # with_kwargs is only supported in PyTorch 2.0 - # use this Catcher hack for now - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - - def forward(self, *args, **kwargs): - # assume first input to forward is hidden states - if len(args) > 0: - hidden_states = args[0] - del args + def _get_layer_state(self, layer_index: int) -> _AWQLayerState: + with self._layer_states_lock: + state = self._layer_states.get(layer_index) + if state is None: + state = _AWQLayerState() + self._layer_states[layer_index] = state + return state + + def _initialize_sample_counts(self) -> None: + total = 0 + dataset = getattr(self, "calibration_dataset", None) + if dataset is None: + self._nsamples_total = 0 + self.nsamples = 0 + return + + for row in dataset: + if not isinstance(row, dict): + continue + input_ids = row.get("input_ids") + if input_ids is None: + continue + if isinstance(input_ids, torch.Tensor): + if input_ids.dim() <= 1: + total += 1 else: - first_key = list(kwargs.keys())[0] - hidden_states = kwargs.pop(first_key) + total += input_ids.shape[0] + else: + try: + total += len(input_ids) + except TypeError: + total += 1 - inps.append(hidden_states) - layer_kwargs.update(kwargs) - raise ValueError # early exit to break later inference + self._nsamples_total = total + self.nsamples = total - # patch layer 0 to catch input and kwargs - modules[0] = Catcher(modules[0]) - try: - # If use_cache=True, layer_kwargs will contain past_key_values instead of attention_mask. - # Autoawq does not pass the use_cache parameter here. - # I haven't found the root cause yet. 
- - # Check if model parameters are on meta device and use best_device instead - # to avoid torch.autocast(device_type="meta") error in transformers - model_device = next(self.model.parameters()).device - if model_device.type == "meta": - target_device = best_device + def _record_input_feature(self, module_name: str, feature: torch.Tensor) -> None: + if feature.device.type != "cpu": + feature = feature.detach().cpu() + else: + feature = feature.detach() + + with self.lock: + entry = self.tasks.get(module_name) + if entry is None: + entry = {"inputs": []} + self.tasks[module_name] = entry + inputs_list = entry.setdefault("inputs", []) + inputs_list.append(feature) + + def _capture_previous_subset_scale(self, previous_subset: Optional[Dict[str, NamedModule]]) -> Optional[float]: + if not previous_subset: + return None + + values: List[float] = [] + for named_module in previous_subset.values(): + weight = getattr(named_module.module, "weight", None) + if weight is None: + continue + with torch.no_grad(): + values.append(float(weight.detach().abs().mean().item())) + + if not values: + return None + + return float(sum(values) / len(values)) + + def _layer_input_features(self, state: _AWQLayerState) -> Dict[str, torch.Tensor]: + features: Dict[str, torch.Tensor] = {} + root_buckets: Dict[str, List[torch.Tensor]] = {} + for name in state.modules: + entry = self.tasks.get(name) or {} + tensors: List[torch.Tensor] = entry.get("inputs", []) # type: ignore[arg-type] + if not tensors: + features[name] = torch.empty(0) + continue + try: + features[name] = torch.cat(tensors, dim=0) + except RuntimeError: + features[name] = tensors[0] + root = name.split(".", 1)[0] + root_buckets.setdefault(root, []).extend(tensors) + if features[name] is not None and features[name].numel() > 0: + pass # previously logged input feature shapes + # log.info( + # "AWQProcessor: input feature `%s` shape=%s", + # name, + # tuple(features[name].shape), + # ) + + for root, tensors in root_buckets.items(): + if not tensors or root in features: + continue + try: + features[root] = torch.cat(tensors, dim=0) + except RuntimeError: + features[root] = tensors[0] + return features + + def _refresh_forward_kwargs_from_cache(self) -> None: + cache = getattr(self, "inputs_cache", None) + if cache is None: + return + + refreshed: Dict[str, torch.Tensor] = {} + + if getattr(cache, "attention_masks", None): + mask = cache.attention_masks[-1] + refreshed["attention_mask"] = mask + else: + refreshed["attention_mask"] = None + + rotary = getattr(getattr(self.model, "model", self.model), "rotary_emb", None) + pos_ids_cache = cache.position_ids[-1] if getattr(cache, "position_ids", None) else None + hidden_cache = None + if getattr(cache, "layer_inputs", None): + last_inputs = cache.layer_inputs[-1] + if last_inputs: + hidden_cache = last_inputs[0] + + if rotary is not None and hidden_cache is not None: + x_for_rotary = hidden_cache + if x_for_rotary.dim() == 2: + x_for_rotary = x_for_rotary.unsqueeze(0) + seq_len = x_for_rotary.shape[1] if x_for_rotary.dim() >= 2 else x_for_rotary.shape[0] + batch = x_for_rotary.shape[0] if x_for_rotary.dim() >= 2 else 1 + + target_device = getattr(getattr(rotary, "inv_freq", None), "device", None) + if target_device is not None and x_for_rotary.device != target_device: + x_for_rotary = x_for_rotary.to(target_device) + + if pos_ids_cache is not None and pos_ids_cache.shape[-1] == seq_len: + pos_for_rotary = pos_ids_cache.to(x_for_rotary.device) else: - target_device = model_device + pos_for_rotary = 
torch.arange(seq_len, device=x_for_rotary.device, dtype=torch.long) + pos_for_rotary = pos_for_rotary.unsqueeze(0).expand(batch, -1) + + refreshed["position_ids"] = pos_for_rotary + try: + pe = rotary(x_for_rotary, pos_for_rotary) + refreshed["position_embeddings"] = pe + except Exception: + pass + elif pos_ids_cache is not None: + refreshed["position_ids"] = pos_ids_cache + + if getattr(cache, "layer_input_kwargs", None): + latest_kwargs = cache.layer_input_kwargs[-1] or {} + for key, value in latest_kwargs.items(): + refreshed[key] = value + + self._module_forward_kwargs = refreshed + + def _quantize_layer(self, layer_index: int, state: _AWQLayerState) -> None: + with state.lock: + if state.quantized: + return + + layer_module = state.layer_module + if layer_module is None and state.modules: + sample_module = next(iter(state.modules.values())) + layer_path = sample_module.full_name.rsplit(".", 1)[0] + layer_module, _ = get_module_by_name_prefix(self.gptq_model.model, layer_path) + state.layer_module = layer_module + + layer_module_ref = state.layer_module + + if layer_module_ref is None: + raise RuntimeError(f"AWQProcessor: unable to resolve layer module for layer index {layer_index}") + + log.info( + "AWQProcessor: layer %s tracking %s modules before quantization (subsets processed=%s/%s); first modules=%s", + layer_index, + len(state.modules), + len(state.processed_subsets), + state.subset_total, + list(state.modules.keys())[:8], + ) + + input_feat = self._layer_input_features(state) + missing = [name for name, tensor in input_feat.items() if tensor.numel() == 0] + if missing: + log.warning( + "AWQProcessor: layer %s skipping %d modules with missing features (sample=%s)", + layer_index, + len(missing), + missing[:8], + ) + # Drop modules with no captured activations so the layer can still quantize the rest + for name in missing: + input_feat.pop(name, None) + with state.lock: + state.modules.pop(name, None) + state.pending_modules.discard(name) + task_entry = self.tasks.pop(name, None) + if task_entry and "inputs" in task_entry: + task_entry["inputs"].clear() + + with state.lock: + remaining_modules = dict(state.modules) + + if not remaining_modules: + log.warning( + "AWQProcessor: layer %s has no modules with captured activations; marking quantized.", + layer_index, + ) + with state.lock: + state.quantized = True + state.processed_subsets.clear() + state.subset_total = None + state.previous_weight_scale = None + if hasattr(self._scale_context, "layer_index"): + delattr(self._scale_context, "layer_index") + if hasattr(self._scale_context, "prev_scale"): + delattr(self._scale_context, "prev_scale") + return + + with state.lock: + named_childs = dict(state.modules) + + module_kwargs_global = dict(self._module_forward_kwargs) + + setattr(self._scale_context, "layer_index", layer_index) + setattr(self._scale_context, "prev_scale", state.previous_weight_scale) + + while True: + try: + module_config = self.gptq_model.awq_get_modules_for_scaling( + layer_module_ref, + input_feat, + module_kwargs_global, + ) + break + except KeyError as missing_key: + missing_name = missing_key.args[0] + if missing_name in input_feat or not input_feat: + raise + surrogate = next(iter(input_feat.values())) + input_feat[missing_name] = surrogate + log.debug( + "AWQProcessor: layer %s using surrogate activation for missing module `%s`.", + layer_index, + missing_name, + ) - print(f"AWQProcessor: model parameters are on meta device, using {target_device} instead") + if not module_config: + log.warning( + 
"AWQProcessor: no module configuration generated for layer index %s; skipping quantization.", + layer_index, + ) + with state.lock: + state.quantized = True + state.modules.clear() + state.pending_modules.clear() + state.layer_module = None + state.processed_subsets.clear() + state.subset_total = None + state.previous_weight_scale = None + if hasattr(self._scale_context, "layer_index"): + delattr(self._scale_context, "layer_index") + if hasattr(self._scale_context, "prev_scale"): + delattr(self._scale_context, "prev_scale") + return + + sanitized_module_config: List[Dict] = [] + for entry in module_config: + entry = dict(entry) + inspect_module = entry.get("module2inspect") or layer_module_ref + entry_kwargs = entry.get("kwargs") or module_kwargs_global + entry["kwargs"] = self._sanitize_kwargs(entry_kwargs, inspect_module) + sanitized_module_config.append(entry) + + filtered_module_config: List[Dict] = [] + skipped_groups: List[Tuple[List[str], List[str]]] = [] + for cfg in sanitized_module_config: + layers_sample = cfg.get("layers") or [] + prev_module = cfg.get("prev_op") + first_layer_module = layers_sample[0] if layers_sample else None + # Some configs alias prev_op to the first layer (e.g. gate_proj); treat that as valid + same_module = prev_module is first_layer_module + if ( + isinstance(prev_module, nn.Linear) + and isinstance(first_layer_module, nn.Linear) + and not same_module + and prev_module.weight.shape[0] != first_layer_module.weight.shape[1] + ): + try: + prev_name = get_op_name(layer_module_ref, prev_module) + except Exception: + prev_name = str(prev_module) + try: + first_name = get_op_name(layer_module_ref, first_layer_module) + except Exception: + first_name = str(first_layer_module) + log.debug( + "AWQProcessor: layer %s skipping scaling group due to dimension mismatch prev_op=%s shape=%s first_layer=%s shape=%s", + layer_index, + prev_name, + tuple(prev_module.weight.shape), + first_name, + tuple(first_layer_module.weight.shape), + ) + continue + layer_names = [ + get_op_name(layer_module_ref, layer) if isinstance(layer, torch.nn.Module) else str(layer) + for layer in layers_sample + ] + missing_layers = [name for name in layer_names if name not in input_feat] + if missing_layers: + skipped_groups.append((layer_names, missing_layers)) + continue + filtered_module_config.append(cfg) + + if skipped_groups: + log.debug( + "AWQProcessor: layer %s skipping %d scaling groups due to missing features (sample=%s)", + layer_index, + len(skipped_groups), + skipped_groups[:3], + ) - self.model(samples.to(torch.device(target_device)), use_cache=False) - except ValueError: # work with early exit - pass - modules[0] = modules[0].module # restore - - # Update the layer kwargs with `prepare_inputs_for_generation` method - # that takes care of everything to avoid unexpected errors. - layer_kwargs = self.model.prepare_inputs_for_generation(samples, **layer_kwargs) - # Pop the input_ids as they are not needed at all. 
- layer_kwargs.pop("input_ids") - - del samples - inps = inps[0] - - # we no longer need embed, reduce vram - self.gptq_model.move_embed("cpu") - - if layer_kwargs.get("attention_mask") is not None: - layer_kwargs["attention_mask"] = layer_kwargs["attention_mask"].to( - best_device + sanitized_module_config = filtered_module_config + if not sanitized_module_config: + log.warning( + "AWQProcessor: no valid scaling groups for layer %s after filtering; marking layer as quantized.", + layer_index, ) - elif "qwen" in self.model.config.model_type: - layer_kwargs["attention_mask"] = None - - return modules, layer_kwargs, inps - - def _get_input_feat(self, layer, named_linears): - # firstly, get input features of all linear layers - def cache_input_hook(m, x, y, name, feat_dict): - x = x[0] - x = x.detach().cpu() - feat_dict[name].append(x) - - input_feat = defaultdict(list) - handles = [] - - # FIXME: Workaround for Mixtral to use block_sparse_moe input features - if self.model.config.model_type == "mixtral": - named_linears = { - **named_linears, - "block_sparse_moe": layer.block_sparse_moe, - } + with state.lock: + state.quantized = True + state.processed_subsets.clear() + state.subset_total = None + state.previous_weight_scale = None + if hasattr(self._scale_context, "layer_index"): + delattr(self._scale_context, "layer_index") + if hasattr(self._scale_context, "prev_scale"): + delattr(self._scale_context, "prev_scale") + return + + sample_groups = [] + for cfg in sanitized_module_config[:3]: + layers_sample = cfg.get("layers") or [] + layers_names = [get_op_name(layer_module_ref, layer) if isinstance(layer, torch.nn.Module) else str(layer) for layer in layers_sample[:4]] + sample_groups.append(layers_names) + + log.info( + "AWQProcessor: layer %s sanitized %d scaling groups; sample=%s", + layer_index, + len(sanitized_module_config), + sample_groups, + ) - if self.model.config.model_type == "deepseek_v2" or self.model.config.model_type == "deepseek_v3": - named_linears = { - **named_linears, - "mlp": layer.mlp, - } + scales_list = [ + self._search_best_scale(layer_module_ref, **layer) + for layer in sanitized_module_config + ] - if self.model.config.model_type == "qwen3_moe": - named_linears = { - **named_linears, - "mlp": layer.mlp, - } + try: + apply_scale(layer_module_ref, scales_list, input_feat_dict=input_feat) + except RuntimeError as exc: + debug_entries = [] + for prev_op_name, layer_names, scales, _loss in scales_list: + entry = {"prev": prev_op_name, "prev_shape": None, "layer_shapes": [], "scale_elems": None} + try: + prev_module = get_op_by_name(layer_module_ref, prev_op_name) + except Exception: + prev_module = None + weight = getattr(prev_module, "weight", None) if prev_module is not None else None + if isinstance(weight, torch.Tensor): + entry["prev_shape"] = tuple(weight.shape) + elif prev_module is None: + entry["prev_shape"] = "missing" + else: + entry["prev_shape"] = f"{type(prev_module).__name__} (no weight)" + + layer_shapes = [] + for lname in layer_names: + try: + layer_module = get_op_by_name(layer_module_ref, lname) + except Exception: + layer_shapes.append((lname, "missing")) + continue + weight = getattr(layer_module, "weight", None) + if isinstance(weight, torch.Tensor): + layer_shapes.append((lname, tuple(weight.shape))) + else: + layer_shapes.append((lname, f"{type(layer_module).__name__} (no weight)")) + entry["layer_shapes"] = layer_shapes + if hasattr(scales, "numel"): + try: + entry["scale_elems"] = int(scales.numel()) + except Exception: + 
entry["scale_elems"] = "unknown" + debug_entries.append(entry) + + log.error( + "AWQProcessor: apply_scale failed at layer %s with %s. Shape summary (first 5 groups): %s", + layer_index, + exc, + debug_entries[:5], + ) + raise + scales_list = append_str_prefix( + scales_list, + get_op_name(self.model, layer_module_ref) + ".", + ) - for name in named_linears: - handles.append( - named_linears[name].register_forward_hook( - functools.partial(cache_input_hook, name=name, feat_dict=input_feat) - ) + clip_list = None + if self.apply_clip: + clip_list = self._search_best_clip( + layer_module_ref, + {name: named.module for name, named in named_childs.items()}, + input_feat, ) - self.inps = self.inps.to(next(layer.parameters()).device) # in case multi-gpu - # get output as next layer's input - - # Sanitize the kwargs in case we use transformers version that contains - # kwargs that are not handled by the module. - # Useful for trust_remote_code models. - module_kwargs = self._sanitize_kwargs(self.module_kwargs, layer) - - self.inps = self._module_forward(self.inps, layer, module_kwargs) - for h in handles: - h.remove() - - # now solve for scaling and clipping - def cat_and_assert(k, v): - x = torch.cat(v, dim=0) - assert x.shape[0] != 0, ( - f"{k} has a zero dimension. This can happen if no data was passed through (e.g. an expert in MoE not being activated). " - "Try increasing max_calib_samples (warning: this can significantly increase quantization time and memory usage.)" + apply_clip(layer_module_ref, clip_list) + clip_list = append_str_prefix( + clip_list, + get_op_name(self.model, layer_module_ref) + ".", ) - return x - input_feat = {k: cat_and_assert(k, v) for k, v in input_feat.items()} - return input_feat + named_childs = {name: named for name, named in named_childs.items() if name in input_feat} + + if not self.export_compatible: + start = time.time() + self._apply_quant(layer_module_ref, named_childs, start, scales_list) + + with state.lock: + state.quantized = True + state.modules.clear() + state.pending_modules.clear() + state.layer_module = None + state.processed_subsets.clear() + state.subset_total = None + state.previous_weight_scale = None + + with self.lock: + for name in named_childs: + task_entry = self.tasks.pop(name, None) + if task_entry and "inputs" in task_entry: + task_entry["inputs"].clear() + + if hasattr(self._scale_context, "layer_index"): + delattr(self._scale_context, "layer_index") + if hasattr(self._scale_context, "prev_scale"): + delattr(self._scale_context, "prev_scale") @torch.inference_mode() def _search_best_scale( @@ -231,8 +545,6 @@ def _search_best_scale( module2inspect=None, kwargs={}, ): - self.nsamples += inp.shape[0] - if module2inspect is None: assert len(layers) == 1 module2inspect = layers[0] @@ -282,6 +594,11 @@ def _search_best_scale( # [STEP 3]: Compute output of module module_kwargs = self._sanitize_kwargs(kwargs, module2inspect) + global_kwargs = getattr(self, "_module_forward_kwargs", {}) + global_allowed_kwargs = self._sanitize_kwargs(global_kwargs, module2inspect) + for key, value in global_allowed_kwargs.items(): + module_kwargs.setdefault(key, value) + with ctx(torch.inference_mode()): fp16_output = self._module_forward(inp, module2inspect, module_kwargs) @@ -299,76 +616,6 @@ def _search_best_scale( loss ) - # The module here is model.layers[x] - def layer_quantize(self, module: Module, device: torch.device, named_childs: Dict[str, NamedModule]): - start = time.time() - common_device = device - - self.inps = self.inps.to(common_device) - - 
# TODO: why do we need this? - # We need to move the rotary embedding every time we move to a new module. - # Transformers 4.45.0 moved rotary embedding to model definition as of this PR: - # https://github.com/huggingface/transformers/pull/32617 - # self.gptq_model.move_embed(common_device) - - # Transformers >= 4.48.0 requires positional embeddings should be computed before forward pass - if self.module_kwargs.get("position_embeddings") is None: - self.module_kwargs["position_embeddings"] = self.model.model.rotary_emb( - self.inps, self.module_kwargs["position_ids"] - ) - - # TODO FIX ME: ??? - if (self.module_kwargs.get('attention_mask') is None): - self.module_kwargs['attention_mask'] = None - - for k, v in self.module_kwargs.items(): - # position embeddings found in tuple - if isinstance(v, tuple): - self.module_kwargs[k] = tuple( - item.to(common_device) if isinstance(item, (torch.Tensor, nn.Module)) - else item for item in v - ) - - # [STEP 1]: Get layer, extract linear modules, extract input features - # named_linears = get_named_linears(module) - named_linears = {name: m.module for name, m in named_childs.items()} - - # TODO quant_config.modules_to_not_convert - # Filter out the linear layers we don't want to exclude - # named_linears = exclude_layers_to_not_quantize( - # named_linears, self.modules_to_not_convert - # ) - - input_feat = self._get_input_feat(module, named_linears) - - # [STEP 2]: Compute and apply scale list - module_config: List[Dict] = self.gptq_model.awq_get_modules_for_scaling( - module, input_feat, self.module_kwargs - ) - scales_list = [ - self._search_best_scale(module, **layer) - for layer in module_config - ] - apply_scale(module, scales_list, input_feat_dict=input_feat) - scales_list = append_str_prefix( - scales_list, get_op_name(self.model, module) + "." - ) - - # [STEP 3]: Compute and apply clipping list - if self.apply_clip: - clip_list = self._search_best_clip( - module, named_linears, input_feat - ) - apply_clip(module, clip_list) - clip_list = append_str_prefix( - clip_list, get_op_name(self.model, module) + "." 
- ) - - # [STEP 4]: Quantize weights - if not self.export_compatible: - self._apply_quant(module, named_childs, start, scales_list) - @torch.inference_mode() def _search_best_clip(self, layer, named_linears, input_feat): clip_list = [] @@ -448,7 +695,7 @@ def _compute_best_clip( del org_out return best_max_val.squeeze(1) - + def pseudo_quantize_tensor(self, w: torch.Tensor): org_w_shape = w.shape if self.qcfg.group_size > 0: @@ -517,6 +764,10 @@ def _compute_best_scale( x_mean = x_mean.view(-1).to(device) w_mean = w_mean.view(-1).to(device) + prev_scale_hint = getattr(self._scale_context, "prev_scale", None) + if prev_scale_hint is not None: + w_mean = w_mean * float(prev_scale_hint) + for ratio in range(n_grid): # create new scales ratio = ratio / n_grid @@ -598,25 +849,103 @@ def _compute_loss( def _module_forward( self, x: torch.Tensor, module: torch.nn.Module, module_kwargs: Dict ) -> torch.Tensor: - if self.n_parallel_calib_samples is None: - # runs through all samples at once - module_output = module(x, **module_kwargs) - if isinstance(module_output, tuple): - module_output = module_output[0] + target_device = None + try: + target_device = next(module.parameters()).device + except StopIteration: + target_device = None + except Exception: + target_device = None + + for key, value in list(module_kwargs.items()): + if isinstance(value, torch.Tensor): + if target_device is not None and value.device != target_device: + module_kwargs[key] = value.to(target_device) + elif isinstance(value, (list, tuple)): + converted = [] + changed = False + for item in value: + if isinstance(item, torch.Tensor) and target_device is not None and item.device != target_device: + converted.append(item.to(target_device)) + changed = True + else: + converted.append(item) + if changed: + module_kwargs[key] = type(value)(converted) + + seq_len = None + batch_dim = None + if x.dim() >= 2: + batch_dim = x.shape[0] + seq_len = x.shape[1] else: - # memory efficiently runs through all calibration samples - # but only n_parallel_calib_samples at a time - module_output = [] - partitioned_inputs = torch.split(x, self.n_parallel_calib_samples) - for x_partial in partitioned_inputs: - partial_output = module(x_partial, **module_kwargs) + batch_dim = 1 + seq_len = x.shape[0] - if isinstance(partial_output, tuple): - partial_output = partial_output[0] + supports_position_ids = False + supports_position_embeddings = False + try: + signature = inspect.signature(module.forward).parameters + supports_position_ids = "position_ids" in signature + supports_position_embeddings = "position_embeddings" in signature + except (ValueError, TypeError): + pass - module_output.append(partial_output.cpu()) + rotary = getattr(getattr(self.model, "model", self.model), "rotary_emb", None) + if seq_len is not None and rotary is not None and supports_position_embeddings: + pos_ids = module_kwargs.get("position_ids") if supports_position_ids else None + if not supports_position_ids: + pos_ids = None + if pos_ids is None or pos_ids.shape[-1] != seq_len: + pos_values = torch.arange(seq_len, device=target_device or x.device, dtype=torch.long) + if x.dim() >= 2: + pos_values = pos_values.unsqueeze(0).expand(batch_dim, -1) + if supports_position_ids: + module_kwargs["position_ids"] = pos_values + pos_for_rotary = pos_values + else: + pos_for_rotary = pos_ids.to(target_device or pos_ids.device) + if supports_position_ids: + module_kwargs["position_ids"] = pos_for_rotary + + x_for_rotary = x if target_device is None else x.to(target_device) + 
module_kwargs["position_embeddings"] = rotary(x_for_rotary, pos_for_rotary) + elif supports_position_ids and seq_len is not None and "position_ids" not in module_kwargs: + pos_values = torch.arange(seq_len, device=target_device or x.device, dtype=torch.long) + if x.dim() >= 2: + pos_values = pos_values.unsqueeze(0).expand(batch_dim, -1) + module_kwargs["position_ids"] = pos_values + + if self._quant_batch_size is None or self._quant_batch_size <= 1: + module_output = module(x, **module_kwargs) + if isinstance(module_output, tuple): + module_output = module_output[0] + return module_output + + def _slice_value(val, length): + if isinstance(val, torch.Tensor) and val.shape[0] == module_kwargs.get("position_ids", val).shape[0]: + return val[:length] + if isinstance(val, torch.Tensor) and val.shape[0] != length: + return val + if isinstance(val, torch.Tensor): + return val[:length] + if isinstance(val, (list, tuple)): + sliced = [_slice_value(item, length) for item in val] + return type(val)(sliced) + return val + + outputs = [] + for x_partial in torch.split(x, self._quant_batch_size, dim=0): + partial_kwargs = { + key: _slice_value(value, x_partial.shape[0]) + for key, value in module_kwargs.items() + } + partial_output = module(x_partial, **partial_kwargs) + if isinstance(partial_output, tuple): + partial_output = partial_output[0] + outputs.append(partial_output) - module_output = torch.cat(module_output, dim=0) + module_output = torch.cat(outputs, dim=0) return module_output @@ -729,7 +1058,7 @@ def _apply_quant(self, module, named_linears: Dict[str, NamedModule], start_time MODULE_FEATURE_COLUMN: self.module_feature_summary(named_module), DTYPE_SIZE_COLUMN: self.module_dtype_size_summary(named_module), QUANT_LOG_LOSS: loss_summary, - QUANT_LOG_NSAMPLES: f"{self.nsamples}", + QUANT_LOG_NSAMPLES: f"{self._nsamples_total}", # QUANT_LOG_DAMP: f"{damp_percent:.5f}", PROCESS_LOG_TIME: f"{duration:.3f}", # PROCESS_LOG_FWD_TIME: f"{self.fwd_time:.3f}", @@ -742,6 +1071,17 @@ def _apply_quant(self, module, named_linears: Dict[str, NamedModule], start_time # Log the new row self.log_new_row(stat) + # Mirror GPTQ-style visibility in the CLI so awq modules show up + # even when the table view is busy with progress updates. 
+ log.info( + "awq | layer=%s module=%s loss=%s samples=%s time=%ss", + named_module.layer_index, + named_module.name, + loss_summary, + self._nsamples_total, + f"{duration:.3f}", + ) + def _sanitize_kwargs(self, inputs_kwargs, module): """ Remove the arguments that are not supported in the module's @@ -761,30 +1101,92 @@ def _sanitize_kwargs(self, inputs_kwargs, module): sanitized_kwargs[k] = v return sanitized_kwargs - def preprocess(self, module: NamedModule, fail_safe: bool): - # TODO Dynamic is not yet supported - pass + def preprocess(self, module: NamedModule, fail_safe: Optional[bool] = None): + layer_state = self._get_layer_state(module.layer_index) + with layer_state.lock: + layer_state.modules[module.name] = module + layer_module_ref = module.state.get("layer_module") + if layer_state.layer_module is None and layer_module_ref is not None: + layer_state.layer_module = layer_module_ref + effective_layer = layer_state.layer_module or layer_module_ref + if not layer_state.pending_modules and effective_layer is not None: + try: + all_linears = find_modules(effective_layer) + except Exception: + all_linears = {} + layer_state.pending_modules.update(all_linears.keys()) + layer_state.pending_modules.add(module.name) + with self.lock: + entry = self.tasks.get(module.name) + if entry is None: + self.tasks[module.name] = {"inputs": []} + else: + entry.setdefault("inputs", []) def is_skipped(self, module: NamedModule) -> bool: - # TODO Dynamic is not yet supported - # gptq has no dynamic method of full override (removal) - # t = self.tasks.get(module.name, False) - # if t == False: - # return True - # else: - # return False - pass + return False def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tensor, ...], torch.Tensor], None]: - pass + def hook(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): + if not inp: + return + feature = inp[0] + if isinstance(feature, (tuple, list)) and feature: + feature = feature[0] + self._record_input_feature(name, feature) + return hook + + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): + self._refresh_forward_kwargs_from_cache() + layer_index = module.layer_index + state = self._get_layer_state(layer_index) + + with state.lock: + if subset is not None: + state.modules.update(subset) + if state.layer_module is None: + for candidate in subset.values(): + layer_module_ref = candidate.state.get("layer_module") + if layer_module_ref is not None: + state.layer_module = layer_module_ref + break + + if subset_total is not None: + state.subset_total = subset_total + if subset_index is not None: + state.processed_subsets.add(subset_index) + + if module is not None: + state.pending_modules.discard(module.name) + + if previous_subset: + state.previous_weight_scale = self._capture_previous_subset_scale(previous_subset) + + should_quantize = ( + not state.quantized + and bool(state.modules) + and ( + not state.pending_modules + or ( + state.subset_total is not None + and len(state.processed_subsets) >= state.subset_total + ) + ) + ) - def process(self, module: NamedModule): - # awq uses model.layers[0] for quantization instead of model.layers.0.self_attn.q_proj - # This method will not be called. 
- pass + if should_quantize: + self._quantize_layer(layer_index, state) # submodule_finalized is called in reverse after all next sequential processes are called - def submodule_finalize(self, module: NamedModule, **kwargs): + def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs): # generate complete, safe to move to cpu module.weight.data = move_to(module.weight.data, device=CPU) # large weights is slow to init on cpu module.state.pop("w", None) # no need for original weights now diff --git a/gptqmodel/looper/dequantize_processor.py b/gptqmodel/looper/dequantize_processor.py index 50475970a..138c70833 100644 --- a/gptqmodel/looper/dequantize_processor.py +++ b/gptqmodel/looper/dequantize_processor.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium -from typing import Dict +from typing import Dict, Optional import torch @@ -28,7 +28,15 @@ def set_calibration_dataset(self, calibration_dataset): self.num_batches = 0 # de-quantize weights - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): device = module.weight.device # TODO fix num_itr param..need to calculate this before dequant diff --git a/gptqmodel/looper/eora_processor.py b/gptqmodel/looper/eora_processor.py index 893b805ef..b31737cdf 100644 --- a/gptqmodel/looper/eora_processor.py +++ b/gptqmodel/looper/eora_processor.py @@ -195,7 +195,15 @@ def _finalize_eigen_scaling_matrix(self, name: str) -> torch.Tensor: return merge_eora_segments(segment_pairs) - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): assert isinstance(module.adapter_cfg, Lora) self.pb.title(f"EoRA: Processing {module.name} ({module.module_dtype}) in layer").draw() diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py index c0622ad43..ca843dc96 100644 --- a/gptqmodel/looper/gptq_processor.py +++ b/gptqmodel/looper/gptq_processor.py @@ -7,7 +7,7 @@ import copy import threading import time -from typing import Callable, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple import torch from torch.nn import Module @@ -104,7 +104,15 @@ def tmp(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): del inp, out return tmp - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): # Reset peak memory stats #torch.cuda.reset_peak_memory_stats() self.pb.title(f"Quantizing {module.name} in layer ").draw() diff --git a/gptqmodel/looper/loop_processor.py b/gptqmodel/looper/loop_processor.py index 2d6b3b476..c995208ca 100644 --- a/gptqmodel/looper/loop_processor.py +++ b/gptqmodel/looper/loop_processor.py @@ -545,7 +545,15 @@ def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tenso pass # do work and return processor.self state which will updated/merged - def process(self, 
module: NamedModule, device: torch.device = None): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): pass # last step, after all loop processor is called diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 85ccea470..054cff7ea 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -1030,7 +1030,12 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs): if region_timer is not None: region_timer.flush() - layer_modules = self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config, quantize_config=self.gptq_model.quantize_config) + is_awq_quantize = any(isinstance(proc, AWQProcessor) for proc in self.processors) + layer_modules = self.gptq_model.simple_layer_modules( + model_config=self.gptq_model.model.config, + quantize_config=self.gptq_model.quantize_config, + is_awq_quantize=is_awq_quantize, + ) # true-sequential will replay the quantized activations after each subset has been quantized to be used for next subset quantization # this should always be true for gptq unless you want lower but misleading error_loss that is misleading and will lead to lower post-quantized model @@ -1143,8 +1148,7 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs): return total_log - def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe) -> Dict[str, NamedModule]: - is_awq_quant = isinstance(processor, AWQProcessor) + def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]: subset = {} for n in names: if n in full: @@ -1168,20 +1172,20 @@ def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefi subset[name] = named_module full[name] = named_module + if layer_module is not None: + named_module.state.setdefault("layer_module", layer_module) - if not is_awq_quant: - if isinstance(processor, GPTQProcessor): - processor.preprocess(subset[name], fail_safe=fail_safe) - else: - processor.preprocess(subset[name]) - # some modules are skipped - if processor.is_skipped(subset[name]): - skipped_modules.append(name) - - if not is_awq_quant: - for name in skipped_modules: - subset.pop(name) - task_map = getattr(processor, "tasks", None) - if task_map is not None: - task_map.pop(name, None) + if isinstance(processor, GPTQProcessor): + processor.preprocess(subset[name], fail_safe=fail_safe) + else: + processor.preprocess(subset[name]) + # some modules are skipped + if processor.is_skipped(subset[name]): + skipped_modules.append(name) + + for name in skipped_modules: + subset.pop(name) + task_map = getattr(processor, "tasks", None) + if task_map is not None: + task_map.pop(name, None) return subset diff --git a/gptqmodel/looper/native_processor.py b/gptqmodel/looper/native_processor.py index ed1d027f4..9d5ad255c 100644 --- a/gptqmodel/looper/native_processor.py +++ b/gptqmodel/looper/native_processor.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # Contact: qubitium@modelcloud.ai, x.com/qubitium -from typing import Callable, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple import torch from torch.nn import Module @@ -66,7 +66,15 @@ def tmp(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): return 
tmp - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): module.state[NATIVE_INPUTS_STATE_KEY] = self.native_inp_caches.pop(module.name) def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs): diff --git a/gptqmodel/looper/qqq_processor.py b/gptqmodel/looper/qqq_processor.py index 365127126..319e390b8 100644 --- a/gptqmodel/looper/qqq_processor.py +++ b/gptqmodel/looper/qqq_processor.py @@ -5,7 +5,7 @@ import contextlib import copy -from typing import Callable, Optional, Tuple +from typing import Callable, Dict, Optional, Tuple import torch from torch.nn import Module @@ -100,7 +100,15 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): q.add_batch(inp[0].data, out.data) # noqa: F821 return tmp - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): self.pb.title(f"Quantizing {module.name} in layer ").draw() qqq = self.tasks diff --git a/gptqmodel/looper/stage_layer.py b/gptqmodel/looper/stage_layer.py index 49dc14df8..972abc295 100644 --- a/gptqmodel/looper/stage_layer.py +++ b/gptqmodel/looper/stage_layer.py @@ -7,10 +7,10 @@ from __future__ import annotations +import logging import threading import time from concurrent.futures import as_completed -from contextlib import nullcontext from typing import TYPE_CHECKING, Dict, List, Optional import torch @@ -20,10 +20,8 @@ from ..looper.gptq_processor import GPTQProcessor from ..looper.named_module import NamedModule from ..looper.qqq_processor import QQQProcessor -from ..utils.ctx import ctx from ..utils.device import get_device, get_device_new from ..utils.logger import log_time_block, setup_logger -from ..utils.looper_helpers import device_ctx from ..utils.model import find_modules, get_module from ..utils.offload import offload_to_disk from ..utils.torch import CPU, torch_sync @@ -90,37 +88,6 @@ def run_layer_stage( # merge all subsets into one modules = [sum(modules, [])] - # AWQ does per-layer itself; skip here - if isinstance(processor, AWQProcessor): - named_childs = dict() - for index, names in enumerate(modules): - named_modules = looper.crate_named_modules(full=full, - is_lm_head_module=is_lm_head_module, - layer_index=layer_index, layers_prefix=layers_prefix, - names=names, - processor=processor, - fail_safe=fail_safe) - named_childs.update(named_modules) - - lock_ctx = nullcontext() - device_for_ctx = cur_layer_device if getattr(cur_layer_device, 'type', None) != 'meta' else None - if device_for_ctx is not None: - lock_ctx = DEVICE_THREAD_POOL.read_lock(cur_layer_device) - with ctx(lock_ctx, device_ctx(device_for_ctx)): - processor.layer_quantize(module, cur_layer_device, named_childs) - if p_index == len(looper.processors) - 1: - looper._emit_layer_complete( - layer_idx=layer_index, - submodule_finalized=False, - raise_in_place=True, - ) - looper._emit_layer_complete( - layer_idx=layer_index, - submodule_finalized=True, - raise_in_place=True, - ) - continue - layer_inputs = processor.inputs_cache.layer_inputs if is_lm_head_module: layer_inputs = 
looper.gptq_model.lm_head_pre_quantize_generate_hook(layer_inputs) @@ -131,8 +98,28 @@ def run_layer_stage( processed_subset: Dict[str, NamedModule] = {} last_subset_context: Optional[SubsetForwardContext] = None subset_total = len(modules) + previous_subset_processed: Optional[Dict[str, NamedModule]] = None for index, names in enumerate(modules): + if isinstance(processor, AWQProcessor): + log.info( + "StageLayer[awq]: layer=%s subset=%s/%s size=%s names=%s", + layer_index, + index + 1, + subset_total, + len(names), + names[:5], + ) + elif log.isEnabledFor(logging.DEBUG): + log.debug( + "StageLayer: layer=%s subset=%s/%s processor=%s size=%s names=%s", + layer_index, + index + 1, + subset_total, + processor.name(), + len(names), + names[:8], + ) subset_result = run_subset_stage( looper, processor=processor, @@ -156,10 +143,12 @@ def run_layer_stage( pb=pb, log=log, region_timer=region_timer, + previous_processed_subset=previous_subset_processed, ) layer_inputs = subset_result.layer_inputs processed_subset.update(subset_result.processed_subset) + previous_subset_processed = subset_result.processed_subset if subset_result.forward_context is not None: last_subset_context = subset_result.forward_context diff --git a/gptqmodel/looper/stage_subset.py b/gptqmodel/looper/stage_subset.py index befcc6dd1..63e103c58 100644 --- a/gptqmodel/looper/stage_subset.py +++ b/gptqmodel/looper/stage_subset.py @@ -7,6 +7,7 @@ from __future__ import annotations +import logging import math import time from dataclasses import dataclass @@ -67,10 +68,15 @@ def run_subset_stage( pb, log=None, region_timer=None, + previous_processed_subset: Optional[Dict[str, NamedModule]] = None, ) -> SubsetStageResult: """Process a single subset of modules within the layer quantization loop.""" logger = log or setup_logger() + processor_name = processor.name() if hasattr(processor, "name") else type(processor).__name__ + processor_name_lower = processor_name.lower() + is_awq_processor = processor_name_lower.startswith("awq") + subset = looper.crate_named_modules( full=full, is_lm_head_module=is_lm_head_module, @@ -79,11 +85,41 @@ def run_subset_stage( names=subset_names, processor=processor, fail_safe=fail_safe, + layer_module=module, ) if len(subset) == 0: + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "StageSubset: layer=%s subset=%s/%s processor=%s produced empty subset (names=%s)", + layer_index, + subset_index + 1, + subset_total, + processor_name, + subset_names, + ) return SubsetStageResult(processed_subset={}, layer_inputs=layer_inputs, forward_context=None) + if is_awq_processor: + logger.info( + "StageSubset[awq]: layer=%s subset=%s/%s modules=%s sample=%s", + layer_index, + subset_index + 1, + subset_total, + len(subset), + list(subset.keys())[:8], + ) + elif logger.isEnabledFor(logging.DEBUG): + logger.debug( + "StageSubset: layer=%s subset=%s/%s processor=%s created %s modules (sample=%s)", + layer_index, + subset_index + 1, + subset_total, + processor_name, + len(subset), + list(subset.keys())[:8], + ) + moe_group_keys_all: List[str] = [] forward_device_map: Dict[str, torch.device] = {} subset_forward_serial = False @@ -190,6 +226,24 @@ def run_subset_stage( looper._masked_hook_wrapper(processor, original_hook, hook_source) )) + if is_awq_processor: + logger.info( + "StageSubset[awq]: layer=%s subset=%s/%s registering hooks for %s modules", + layer_index, + subset_index + 1, + subset_total, + len(subset), + ) + elif logger.isEnabledFor(logging.DEBUG): + logger.debug( + "StageSubset: layer=%s subset=%s/%s 
processor=%s registering hooks for %s modules", + layer_index, + subset_index + 1, + subset_total, + processor_name, + len(subset), + ) + fwd_start = time.perf_counter() forward_source = f"{layer_descriptor}:subset{subset_index + 1}/{subset_total}" @@ -313,8 +367,13 @@ def _process_on_worker( proc: LoopProcessor, nm: NamedModule, expected_device: torch.device, + subset_ref: Dict[str, NamedModule], + previous_subset_ref: Optional[Dict[str, NamedModule]], + subset_idx: int, + subset_total_count: int, ): module_label = getattr(nm, "full_name", getattr(nm, "name", repr(nm))) + proc_name = proc.name() if hasattr(proc, "name") else type(proc).__name__ module_ref = nm.module if isinstance(nm, NamedModule) else nm module_weight = getattr(module_ref, "weight", None) if module_weight is not None and expected_device is not None: @@ -328,7 +387,33 @@ def _process_on_worker( timer = getattr(looper.gptq_model, "quant_region_timer", None) start = time.perf_counter() if timer else None try: - proc.process(module=nm) + if is_awq_processor: + logger.info( + "StageSubsetWorker[awq]: layer=%s subset=%s/%s module=%s previous_subset=%s", + getattr(nm, "layer_index", None), + subset_idx + 1, + subset_total_count, + module_label, + bool(previous_subset_ref), + ) + elif logger.isEnabledFor(logging.DEBUG): + logger.debug( + "StageSubsetWorker: processor=%s layer=%s subset=%s/%s module=%s running on %s (previous_subset=%s)", + proc_name, + getattr(nm, "layer_index", None), + subset_idx + 1, + subset_total_count, + module_label, + expected_device, + bool(previous_subset_ref), + ) + proc.process( + module=nm, + subset=subset_ref, + previous_subset=previous_subset_ref, + subset_index=subset_idx, + subset_total=subset_total_count, + ) finally: if timer is not None and start is not None: timer.record( @@ -341,7 +426,17 @@ def _process_on_worker( for name, named_module in subset.items(): tgt_dev = quant_target_devices.get(name, cur_layer_device) futures.append( - DEVICE_THREAD_POOL.submit(tgt_dev, _process_on_worker, processor, named_module, tgt_dev) + DEVICE_THREAD_POOL.submit( + tgt_dev, + _process_on_worker, + processor, + named_module, + tgt_dev, + subset, + previous_processed_subset, + subset_index, + subset_total, + ) ) for fut in futures: diff --git a/gptqmodel/looper/tensorparallel_weight_processor.py b/gptqmodel/looper/tensorparallel_weight_processor.py index e4b0270c0..fb60cc3aa 100644 --- a/gptqmodel/looper/tensorparallel_weight_processor.py +++ b/gptqmodel/looper/tensorparallel_weight_processor.py @@ -8,7 +8,7 @@ from __future__ import annotations import math -from typing import Dict +from typing import Dict, Optional import torch @@ -60,7 +60,15 @@ def _noop(module, inputs, output): return _noop - def process(self, module: NamedModule): + def process( + self, + module: NamedModule, + device: torch.device = None, + subset: Optional[Dict[str, NamedModule]] = None, + previous_subset: Optional[Dict[str, NamedModule]] = None, + subset_index: Optional[int] = None, + subset_total: Optional[int] = None, + ): target = module.module if isinstance(module, NamedModule) else module weight = getattr(target, "weight", None) if weight is None: diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index bcd1ff641..27508d769 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -1334,6 +1334,32 @@ def strip_not_quantize_flag(module_name): else: return module_name + def _select_feature_name(names): + """Return the first quantized child that has captured activations.""" + for raw in names: + stripped 
= strip_not_quantize_flag(raw) + if stripped in input_feat: + return stripped + return strip_not_quantize_flag(names[0]) if names else None + + def _try_update_last_module(candidate_name: str) -> bool: + nonlocal last_module, last_module_name, last_module_root + + resolved_module, _ = get_module_by_name_prefix(module, candidate_name) + if resolved_module is None: + log.debug( + "awq_get_modules_for_scaling: last-module candidate `%s` missing; retaining previous `%s`", + candidate_name, + last_module_name, + ) + return False + + last_module = resolved_module + last_module_name = candidate_name + if "." in candidate_name: + last_module_root = candidate_name.split(".", 1)[0] + return True + full_layer_modules = self.full_layer_modules(self.model.config, is_awq_quantize=True) for i, block in enumerate(full_layer_modules): not_quantized = all(NOT_QUANTIZE_FLAG in name for name in block) @@ -1345,8 +1371,8 @@ def strip_not_quantize_flag(module_name): continue # Remember the latest norm (use the last entry if multiple are present) - last_module_name = strip_not_quantize_flag(block[-1]) - last_module, _ = get_module_by_name_prefix(module, last_module_name) + candidate_name = strip_not_quantize_flag(block[-1]) + _try_update_last_module(candidate_name) continue if num_experts is not None and len(block) == num_experts and last_module is not None and last_module_name is not None: @@ -1355,9 +1381,14 @@ def strip_not_quantize_flag(module_name): for name in block: prev_op_name = ".".join(name.split(".")[:-1] + [target_suffix]) prev_op, _ = get_module_by_name_prefix(module, prev_op_name) - assert prev_op is not None + if prev_op is None or name not in input_feat: + log.debug("awq_get_modules_for_scaling: skipping expert `%s` due to missing prev_op or features", name) + continue m, _ = get_module_by_name_prefix(module, name) + if m is None: + log.debug("awq_get_modules_for_scaling: skipping missing expert module `%s`", name) + continue subset = [m] n, root = generate_node_for_awq_scaling(inp=input_feat[name], prev_op=prev_op, module_kwargs=module_kwargs, nodes_size=len(nodes), @@ -1368,7 +1399,7 @@ def strip_not_quantize_flag(module_name): nodes.append(n) else: # Normal execution subset - subset = [] + subset = [] # preserve execution order while collecting quantizable modules skip = False for name in block: if NOT_QUANTIZE_FLAG not in name: @@ -1382,16 +1413,23 @@ def strip_not_quantize_flag(module_name): # log.debug(f'"{name}" attention out skipped.') skip = True + if m is None: + log.debug("awq_get_modules_for_scaling: skipping missing module `%s`", name) + skip = True + break subset.append(m) - if skip: + if skip or not subset: continue - assert len(subset) > 0 prev_op = last_module - assert prev_op is not None + if prev_op is None: + log.debug("awq_get_modules_for_scaling: skipping block %s due to missing previous module", block) + continue - root_split = block[0].split(".") + # Match the activation bucket to the first quantized child in this block + feature_name = _select_feature_name(block) or strip_not_quantize_flag(block[0]) + root_split = feature_name.split(".") module2inspect = None if len(root_split) >= 2: root = root_split[0] @@ -1400,9 +1438,19 @@ def strip_not_quantize_flag(module_name): module2inspect, _ = get_module_by_name_prefix(module, root) if num_experts is not None and len(block) == 2 * num_experts and module2inspect is not None: - inp = input_feat[last_module_root] + if last_module_root not in input_feat: + log.debug( + "awq_get_modules_for_scaling: missing input feature for `%s` 
while processing experts block (layer block size=%s)", + last_module_root, + len(block), + ) + inp = input_feat.get(last_module_root, input_feat.get(_select_feature_name(block))) else: - inp = input_feat[block[0]] + inp = input_feat.get(_select_feature_name(block)) + + if inp is None: + log.debug("awq_get_modules_for_scaling: skipping block %s due to missing input features", block) + continue n, root = generate_node_for_awq_scaling(inp=inp, prev_op=prev_op, module_kwargs=module_kwargs, nodes_size=len(nodes), @@ -1411,8 +1459,8 @@ def strip_not_quantize_flag(module_name): nodes.append(n) # Update tracker to the LAST item of this block - last_module_name = strip_not_quantize_flag(block[-1]) - last_module, _ = get_module_by_name_prefix(module, last_module_name) + candidate_name = strip_not_quantize_flag(block[-1]) + _try_update_last_module(candidate_name) import torch def format_nodes(nodes): diff --git a/gptqmodel/utils/nogil_patcher.py b/gptqmodel/utils/nogil_patcher.py index e2779f603..e6b85d42f 100644 --- a/gptqmodel/utils/nogil_patcher.py +++ b/gptqmodel/utils/nogil_patcher.py @@ -7,10 +7,11 @@ import threading import time +from importlib.metadata import version + +from packaging.version import InvalidVersion, Version from .safe import ThreadSafe -from importlib.metadata import version -from packaging.version import Version, InvalidVersion _PATCHED_ATTR = "_gptqmodel_locked_save_file" @@ -37,7 +38,7 @@ def patch_safetensors_save_file() -> None: def patch_triton_autotuner() -> None: try: - import triton + import triton # noqa from triton.runtime import autotuner as module except ImportError: return diff --git a/scripts/eval_model.py b/scripts/eval_model.py new file mode 100644 index 000000000..f6c3d3a3f --- /dev/null +++ b/scripts/eval_model.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 gptqmodel contributors +# SPDX-License-Identifier: Apache-2.0 +"""CLI helper to run lm-eval tasks against a GPTQModel checkpoint.""" + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +import gptqmodel +from tabulate import tabulate + +from gptqmodel.models.base import BaseQModel +from gptqmodel.utils.eval import EVAL + + +if sys.platform == "darwin": + os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1") + +os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID") +os.environ.setdefault( + "PYTORCH_ALLOC_CONF", + "expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7", +) + +DEFAULT_RESULTS_PATH = Path("lm_eval_results.json") +DEFAULT_TASKS = (EVAL.LM_EVAL.ARC_CHALLENGE,) +DEFAULT_TASK_MANAGER_PATH = Path(__file__).resolve().parent.parent / "tests" / "tasks" + + +def _available_backends() -> Dict[str, gptqmodel.BACKEND]: + return {member.name.lower(): member for member in gptqmodel.BACKEND} + + +def _parse_backend(value: str) -> gptqmodel.BACKEND: + lookup = _available_backends() + key = value.strip().lower() + if key not in lookup: + expected = ", ".join(sorted(lookup.keys())) + raise argparse.ArgumentTypeError(f"Unknown backend '{value}'. 
Expected one of: {expected}") + return lookup[key] + + +def _parse_batch_size(value: str) -> str | int: + normalized = value.strip() + if normalized.lower() == "auto": + return "auto" + try: + return int(normalized, 10) + except ValueError as exc: + raise argparse.ArgumentTypeError("Batch size must be 'auto' or an integer") from exc + + +def _coerce_value(text: str): + lowered = text.lower() + if lowered in {"true", "false"}: + return lowered == "true" + try: + if "." in text: + return float(text) + return int(text, 10) + except ValueError: + return text + + +def _parse_key_value_pairs(pairs: Iterable[str]) -> Dict[str, object]: + result: Dict[str, object] = {} + for item in pairs: + if "=" not in item: + raise argparse.ArgumentTypeError(f"Argument '{item}' must be in key=value format") + key, raw_value = item.split("=", 1) + key = key.strip() + if not key: + raise argparse.ArgumentTypeError(f"Argument '{item}' is missing a key") + value = _coerce_value(raw_value.strip()) + result[key] = value + return result + + +def _resolve_task(name: str) -> EVAL.LM_EVAL: + normalized = name.strip() + for task in EVAL.LM_EVAL: + if normalized.lower() in {task.value.lower(), task.name.lower()}: + return task + available = ", ".join(task.value for task in EVAL.LM_EVAL) + raise argparse.ArgumentTypeError(f"Unknown lm-eval task '{name}'. Expected one of: {available}") + + +def _list_tasks() -> None: + rows = [(task.name, task.value) for task in EVAL.LM_EVAL] + print(tabulate(rows, headers=["Name", "Identifier"])) + + +def _extract_metrics(results: Dict) -> Dict[str, Dict[str, float]]: + aggregated: Dict[str, Dict[str, float]] = {} + task_results = results.get("results", {}) + for task_name, metrics in task_results.items(): + filtered = { + metric: value + for metric, value in metrics.items() + if metric != "alias" and "stderr" not in metric + } + aggregated[task_name] = filtered + return aggregated + + +def _print_metrics_table(metrics: Dict[str, Dict[str, float]], table_format: str) -> None: + rows: List[Tuple[str, str, object]] = [] + for task_name in sorted(metrics): + for metric_name in sorted(metrics[task_name]): + rows.append((task_name, metric_name, metrics[task_name][metric_name])) + if not rows: + print("No metrics to display.") + return + print( + tabulate( + rows, + headers=["Task", "Metric", "Value"], + tablefmt=table_format, + floatfmt=".4f", + ) + ) + + +def _split_tasks(arg_value: str | None) -> List[str]: + if not arg_value: + return [] + return [item.strip() for item in arg_value.split(",") if item.strip()] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run lm-eval tasks against a quantized model loaded via gptqmodel." 
+    )
+    parser.add_argument("--model", required=True, help="Model path or Hugging Face repo id.")
+    parser.add_argument(
+        "--backend",
+        default="auto",
+        type=_parse_backend,
+        help="Inference backend to use when loading with gptqmodel.load.",
+    )
+    parser.add_argument(
+        "--tasks",
+        default=",".join(task.value for task in DEFAULT_TASKS),
+        help="Comma-separated lm-eval task identifiers (see --list-tasks).",
+    )
+    parser.add_argument(
+        "--chat-template-tasks",
+        default=None,
+        help="Comma-separated tasks that should apply the model's chat template during evaluation.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        default="auto",
+        type=_parse_batch_size,
+        help="Evaluation batch size passed to lm-eval (integer or 'auto').",
+    )
+    parser.add_argument(
+        "--dtype",
+        default="auto",
+        help="dtype override forwarded to gptqmodel.load (default: auto).",
+    )
+    parser.add_argument(
+        "--gen-kwargs",
+        default=None,
+        help="Generation kwargs forwarded to lm-eval, e.g. 'temperature=0.0,top_k=50'.",
+    )
+    parser.add_argument(
+        "--model-arg",
+        action="append",
+        default=[],
+        metavar="KEY=VALUE",
+        help="Extra model_args forwarded to GPTQModel.eval (repeatable).",
+    )
+    parser.add_argument(
+        "--load-arg",
+        action="append",
+        default=[],
+        metavar="KEY=VALUE",
+        help="Additional keyword arguments passed to gptqmodel.load (repeatable).",
+    )
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Allow loading models that require remote code execution.",
+    )
+    parser.add_argument(
+        "--use-vllm",
+        action="store_true",
+        help="Run evaluation with the vLLM backend instead of the default gptqmodel harness.",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=None,
+        help="Optional max_model_len passed to vLLM model args.",
+    )
+    parser.add_argument(
+        "--random-seed",
+        type=int,
+        default=898,
+        help="Seed propagated to lm-eval for reproducibility.",
+    )
+    parser.add_argument(
+        "--task-manager-path",
+        type=str,
+        default=str(DEFAULT_TASK_MANAGER_PATH) if DEFAULT_TASK_MANAGER_PATH.exists() else None,
+        help="Optional path containing custom lm-eval tasks.",
+    )
+    parser.add_argument(
+        "--include-default-tasks",
+        action="store_true",
+        help="Include lm-eval's builtin task registry alongside the custom task directory.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default=str(DEFAULT_RESULTS_PATH),
+        help="JSON file to store aggregated metrics (use '-' to skip saving).",
+    )
+    parser.add_argument(
+        "--table-format",
+        default="github",
+        help="Tabulate table format (defaults to 'github').",
+    )
+    parser.add_argument(
+        "--list-tasks",
+        action="store_true",
+        help="List supported lm-eval task identifiers and exit.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    if args.list_tasks:
+        _list_tasks()
+        return
+
+    tasks = [_resolve_task(name) for name in _split_tasks(args.tasks)]
+    if not tasks:
+        raise ValueError("No lm-eval tasks specified.")
+    chat_template_tasks = {_resolve_task(name).value for name in _split_tasks(args.chat_template_tasks)}
+
+    llm_backend = "vllm" if args.use_vllm else "gptqmodel"
+    backend: gptqmodel.BACKEND = args.backend
+
+    load_kwargs = _parse_key_value_pairs(args.load_arg)
+    model = gptqmodel.GPTQModel.load(
+        args.model,
+        backend=backend,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        **load_kwargs,
+    )
+
+    if not isinstance(model, BaseQModel):
+        raise RuntimeError("Failed to load GPTQModel; received unexpected object type.")
+
+    model_args = 
_parse_key_value_pairs(args.model_arg) + if args.max_model_len is not None: + model_args.setdefault("max_model_len", args.max_model_len) + + if args.use_vllm: + model_args.setdefault("dtype", "auto") + model_args.setdefault("tensor_parallel_size", 1) + model_args.setdefault("gpu_memory_utilization", 0.8) + + task_manager = None + if args.task_manager_path: + task_manager_path = Path(args.task_manager_path).expanduser().resolve() + if not task_manager_path.exists(): + raise FileNotFoundError(f"Task manager path does not exist: {task_manager_path}") + from lm_eval.tasks import TaskManager + + task_manager = TaskManager( + include_path=str(task_manager_path), + include_defaults=args.include_default_tasks, + ) + + aggregated_metrics: Dict[str, Dict[str, float]] = {} + + grouped_tasks: Dict[bool, List[EVAL.LM_EVAL]] = {} + for task in tasks: + apply_chat = task.value in chat_template_tasks + grouped_tasks.setdefault(apply_chat, []).append(task) + + for apply_chat_template, grouped in grouped_tasks.items(): + if not grouped: + continue + + result = gptqmodel.GPTQModel.eval( + model_or_id_or_path=model, + tasks=grouped, + framework=EVAL.LM_EVAL, + batch_size=args.batch_size, + trust_remote_code=args.trust_remote_code, + output_path=None, + llm_backend=llm_backend, + backend=backend, + random_seed=args.random_seed, + model_args=model_args.copy(), + gen_kwargs=args.gen_kwargs, + apply_chat_template=apply_chat_template, + task_manager=task_manager, + ) + + group_metrics = _extract_metrics(result) + aggregated_metrics.update(group_metrics) + + _print_metrics_table(aggregated_metrics, args.table_format) + + if args.output and args.output != "-": + output_path = Path(args.output).expanduser() + output_path.write_text(json.dumps(aggregated_metrics, indent=2)) + print(f"Saved aggregated metrics to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/tests/models/test_glm4_moe.py b/tests/models/test_glm4_moe.py index 02870bb89..082e94f7d 100644 --- a/tests/models/test_glm4_moe.py +++ b/tests/models/test_glm4_moe.py @@ -9,6 +9,9 @@ class TestGlm4Moe(ModelTest): + # FORMAT = FORMAT.GEMM + # METHOD = METHOD.AWQ + NATIVE_MODEL_ID = "/monster/data/model/GLM-4.6/" DELETE_QUANTIZED_MODEL = False DATASET_SIZE = 512 diff --git a/tests/models/test_glm4_moe_awq.py b/tests/models/test_glm4_moe_awq.py new file mode 100644 index 000000000..285648adc --- /dev/null +++ b/tests/models/test_glm4_moe_awq.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +from model_test import ModelTest + +from gptqmodel.quantization import FORMAT, METHOD +from gptqmodel.utils.eval import EVAL + + +class TestGlm4Moe(ModelTest): + FORMAT = FORMAT.GEMM + METHOD = METHOD.AWQ + + NATIVE_MODEL_ID = "/monster/data/model/GLM-4.6/" + DELETE_QUANTIZED_MODEL = False + DATASET_SIZE = 512 + GROUP_SIZE = 32 + EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5026, "floor_pct": 0.04}, + "acc_norm": {"value": 0.5171, "floor_pct": 0.04}, + }, + EVAL.LM_EVAL.MMLU_STEM: { + "acc": {"value": 0.6362, "floor_pct": 0.04}, + }, + } + def test_glm4moe(self): + self.quant_lm_eval() diff --git a/tests/models/test_llama3_2_awq.py b/tests/models/test_llama3_2_awq.py index 380ed3066..48c4a8c45 100644 --- a/tests/models/test_llama3_2_awq.py +++ b/tests/models/test_llama3_2_awq.py @@ -13,31 +13,34 @@ # desc_act = False, act_group_aware = False 0.2500/0.2841 # 
desc_act = False, act_group_aware = True 0.3063/0.3456 # desc_act = True, 0.3089/0.3328 -class TestLlama3_2(ModelTest): +class TestLlama3_2_awq(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" + EVAL_BATCH_SIZE = 64 EVAL_TASKS = { EVAL.LM_EVAL.ARC_CHALLENGE: { "chat_template": True, - "acc": {"value": 0.3234, "floor_pct": 0.36}, - "acc_norm": {"value": 0.3524, "floor_pct": 0.36}, + "acc": { + "value": 0.3200, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, + "acc_norm": { + "value": 0.3362, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, + }, + EVAL.LM_EVAL.MMLU_STEM: { + "chat_template": False, + "acc": { + "value": 0.3657, + "floor_pct": 0.04, + "ceil_pct": 0.10, + }, }, } - V2 = False - DEBUG = True - ACT_GROUP_AWARE = False - DESC_ACT = False - DATASET_SIZE = 1024 - DATASET_SORT = "desc" - QUANT_BATCH_SIZE = 4 FORMAT = FORMAT.GEMM METHOD = METHOD.AWQ - # USE_FLASH_ATTN = False - # EORA = Lora( - # # for quant, path is save path. for load, it is loading path - # path="./eora_test", - # rank=128, - # ) - # b1 = 0.315, b4 = 0.3106, b8 = 0.3148, b32 = 0.3148, b16 = 0.3234 - def test_llama3_2(self): + def test_llama3_2_awq(self): self.quant_lm_eval() diff --git a/tests/models/test_marin.py b/tests/models/test_marin.py index 4ad9dca07..a9d08be4d 100644 --- a/tests/models/test_marin.py +++ b/tests/models/test_marin.py @@ -4,12 +4,11 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from accelerate import init_empty_weights -from transformers import AutoConfig, AutoModelForCausalLM - from model_test import ModelTest +from transformers import AutoConfig, AutoModelForCausalLM from gptqmodel.models.definitions.qwen3 import Qwen3QModel -from gptqmodel.quantization.config import VRAMStrategy +from gptqmodel.utils.eval import EVAL class TestMarin(ModelTest): diff --git a/tests/models/test_marin_awq.py b/tests/models/test_marin_awq.py new file mode 100644 index 000000000..f43f0ce3c --- /dev/null +++ b/tests/models/test_marin_awq.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +from accelerate import init_empty_weights +from model_test import ModelTest +from transformers import AutoConfig, AutoModelForCausalLM + +from gptqmodel.models.definitions.qwen3 import Qwen3QModel +from gptqmodel.quantization.config import FORMAT, METHOD +from gptqmodel.utils.eval import EVAL + + +class TestMarin(ModelTest): + DATASET_SIZE = 1024 + GROUP_SIZE = 32 + METHOD = METHOD.AWQ + FORMAT = FORMAT.GEMM + + NATIVE_MODEL_ID = "/monster/data/model/marin-32b-base" + # VRAM_STRATEGY = VRAMStrategy.BALANCED + # Marin inherits Qwen3's backbone with QK-Norm attention. 
+ EVAL_TASKS = { + EVAL.LM_EVAL.ARC_CHALLENGE: { + "acc": {"value": 0.5828, "floor_pct": 0.04}, + "acc_norm": {"value": 0.6007, "floor_pct": 0.04}, + }, + EVAL.LM_EVAL.MMLU_STEM: { + "acc": {"value": 0.6673, "floor_pct": 0.04}, + }, + } + + def test_marin_module_tree(self): + config = AutoConfig.from_pretrained(self.NATIVE_MODEL_ID, trust_remote_code=True) + with init_empty_weights(include_buffers=True): + shell = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + + decoder_layer = shell.model.layers[0] + self.assertTrue(hasattr(decoder_layer.self_attn, "q_norm")) + self.assertTrue(hasattr(decoder_layer.self_attn, "k_norm")) + self.assertTrue(hasattr(decoder_layer.self_attn, "q_proj")) + self.assertTrue(hasattr(decoder_layer.self_attn, "o_proj")) + self.assertIn("q_norm:!", Qwen3QModel.module_tree[3]["self_attn"]) + self.assertIn("k_norm:!", Qwen3QModel.module_tree[3]["self_attn"]) + + def test_marin(self): + self.quant_lm_eval() diff --git a/tests/test_kernel_output_awq.py b/tests/test_kernel_output_awq.py index 7b3c0a5bf..9e446beeb 100644 --- a/tests/test_kernel_output_awq.py +++ b/tests/test_kernel_output_awq.py @@ -47,9 +47,9 @@ class TestAwqKernelOutput(unittest.TestCase): backend_cases = [ (baseline_backend, torch.float16, 0.0), # (baseline_backend, torch.bfloat16, 0.0), - (BACKEND.GEMM, torch.float16, 0.001), + (BACKEND.GEMM, torch.float16, 0.004), # (BACKEND.GEMM, torch.bfloat16, 0.05), - (BACKEND.MARLIN, torch.float16, 0.01), + (BACKEND.MARLIN, torch.float16, 0.006), # (BACKEND.MARLIN, torch.bfloat16, 0.05), ]
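Note (appended commentary, not part of the patch above): the subset-aware `process(...)` call added in `gptqmodel/looper/module_looper.py` passes `subset`, `previous_subset`, `subset_index`, and `subset_total` as keyword arguments, and the widened `TensorParallelWeightProcessor.process` signature shows what a processor must accept to stay compatible. The sketch below is a hypothetical, minimal processor (not a class from this repository) that illustrates the same contract; only the keyword names are taken from the diff.

```python
# Minimal sketch of a processor compatible with the widened process(...) call
# used by the subset worker. Hypothetical class; only keyword names mirror the diff.
from typing import Dict, Optional

import torch
from torch import nn


class NoopSubsetProcessor:
    def process(
        self,
        module: nn.Module,
        device: Optional[torch.device] = None,
        subset: Optional[Dict[str, nn.Module]] = None,
        previous_subset: Optional[Dict[str, nn.Module]] = None,
        subset_index: Optional[int] = None,
        subset_total: Optional[int] = None,
    ) -> None:
        # Processors that do not need subset context simply ignore the extra
        # keywords, as TensorParallelWeightProcessor.process does in the diff.
        weight = getattr(module, "weight", None)
        if weight is None:
            return
        # A real processor would quantize or transform `weight` here; this
        # sketch only reports where the call landed.
        print(
            f"processed {module.__class__.__name__} "
            f"(subset {subset_index}/{subset_total}) on {weight.device}"
        )


if __name__ == "__main__":
    layer = nn.Linear(8, 8)
    NoopSubsetProcessor().process(
        module=layer,
        subset={"linear": layer},
        subset_index=0,
        subset_total=1,
    )
```

Threading the subset context through `process(...)` is what lets the AWQ processor see neighboring modules when computing scales, while processors that do not need that context can simply accept and ignore the extra keywords.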