From fab1126992482cb3af61699d34f3029b4bc10b77 Mon Sep 17 00:00:00 2001
From: Qubitium <Qubitium@modelcloud.ai>
Date: Sun, 5 Oct 2025 05:17:17 +0000
Subject: [PATCH 1/5] update progressbar for module finalization (packing and
 offloading)

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
---
 gptqmodel/looper/module_looper.py | 60 ++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py
index 06f1bd0a2..2f733b322 100644
--- a/gptqmodel/looper/module_looper.py
+++ b/gptqmodel/looper/module_looper.py
@@ -925,37 +925,57 @@ def _process_on_worker(proc: LoopProcessor, nm: NamedModule):
                     torch_sync()
 
                     # Gather finalize tasks (can offload to disk); run them via the pool
-                    finalize_futures = []
+                    finalize_tasks = []
 
                     for reverse_p in reversed(self.processors):
-                        for name in processed_subset:
-                            @torch.inference_mode()
-                            def finalize_module(process, module):
-                                process.submodule_finalize(module, self.gptq_model)
+                        for module in processed_subset.values():
+                            target_dev = get_device_new(module, recursive=True, assert_mode=True, expected="cpu")
+                            module_label = getattr(module, "full_name", getattr(module, "name", ""))
+                            finalize_tasks.append((reverse_p, module, module_label, target_dev))
 
-                                # Disk offload (lifecycle TODO note preserved)
-                                if isinstance(process, (GPTQProcessor, QQQProcessor, AWQProcessor)):
-                                    offload_to_disk(
-                                        model=self.gptq_model.model,
-                                        module=self.gptq_model.model.get_submodule(module.full_name),
-                                        disk_path=self.gptq_model.quantize_config.offload_to_disk_path,
-                                    )
+                    finalize_count = len(finalize_tasks)
+                    if finalize_count:
+                        quant_modules_pb.subtitle(
+                            f"Finalizing submodules ({finalize_count})"
+                        ).draw()
 
-                            module = processed_subset[name]
+                    finalize_futures = []
 
-                            target_dev = get_device_new(module, recursive=True, assert_mode=True, expected="cpu")
+                    @torch.inference_mode()
+                    def _finalize_on_worker(process, module, idx, total, module_label):
+                        quant_modules_pb.subtitle(
+                            f"{process.name()}: finalizing {idx}/{total} ({module_label})"
+                        ).draw()
+
+                        process.submodule_finalize(module, self.gptq_model)
+
+                        # Disk offload (lifecycle TODO note preserved)
+                        if isinstance(process, (GPTQProcessor, QQQProcessor, AWQProcessor)):
+                            offload_to_disk(
+                                model=self.gptq_model.model,
+                                module=self.gptq_model.model.get_submodule(module.full_name),
+                                disk_path=self.gptq_model.quantize_config.offload_to_disk_path,
+                            )
 
-                            # Submit on the module's device thread (safe & deterministic)
-                            finalize_futures.append(
-                                DEVICE_THREAD_POOL.submit_serial(
-                                    target_dev, finalize_module, reverse_p, module
-                                )
+                    for index, (process, module, module_label, target_dev) in enumerate(finalize_tasks, start=1):
+                        finalize_futures.append(
+                            DEVICE_THREAD_POOL.submit_serial(
+                                target_dev,
+                                _finalize_on_worker,
+                                process,
+                                module,
+                                index,
+                                finalize_count,
+                                module_label,
                             )
+                        )
 
-                    # If any finalize tasks were queued, wait for them
                     for fut in finalize_futures:
                         fut.result()
 
+                    if finalize_count:
+                        quant_modules_pb.subtitle("").draw()
+
         # LifeCycle: All sub-modules have finalized meaning quantization work is complete
         # Ensure ANY remaining tasks the looper submitted have drained
         DEVICE_THREAD_POOL.wait()  # same as wait('all')

From f486f8191d4d440d9baa1a2539729f13973fb779 Mon Sep 17 00:00:00 2001
From: Qubitium <Qubitium@modelcloud.ai>
Date: Sun, 5 Oct 2025 05:51:21 +0000
Subject: [PATCH 2/5] merge offloaded files into 1 single dat file instead of 1
 per param/buffer

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
---
 gptqmodel/utils/model.py    |  19 ++++-
 gptqmodel/utils/offload.py  | 156 +++++++++++++++++++++++++++++++++++-
 tests/test_offload_files.py |  61 ++++++++++++++
 3 files changed, 232 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_offload_files.py

diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 3ef446fd2..f928c0605 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -1261,6 +1261,20 @@ def _resolve_offload_entry(
             data_offsets=offsets,
         )
 
+    filename = entry.get("filename")
+    if filename:
+        path = filename if os.path.isabs(filename) else os.path.join(module_dir, filename)
+        start = int(entry.get("offset", 0))
+        end = start + (_torch_dtype_num_bytes(resolved_dtype) * math.prod(shape or (1,)))
+        return OffloadTensorRef(
+            path=os.path.abspath(path),
+            dtype=resolved_dtype,
+            shape=shape,
+            format="dat",
+            weight_name=None,
+            data_offsets=(start, end),
+        )
+
     data_path = os.path.join(module_dir, f"{leaf}.dat")
     if not os.path.isfile(data_path):
         return None
@@ -1450,7 +1464,10 @@ def _write_shard_file(path: str, entries: List[TensorSource], metadata: Dict[str
             if isinstance(source, OffloadTensorRef):
                 if source.format == "dat":
                     # print("offload tensor io buffered transfer DAT")
-                    _copy_file_stream(source.path, out, entry.num_bytes)
+                    start = 0
+                    if source.data_offsets is not None:
+                        start = source.data_offsets[0]
+                    _copy_file_stream(source.path, out, entry.num_bytes, offset=start)
                 elif source.format == "safetensors" and source.data_offsets is not None:
                     # print("offload tensor io buffered transfer SAFETENSOR stream")
                     start, end = source.data_offsets
diff --git a/gptqmodel/utils/offload.py b/gptqmodel/utils/offload.py
index 37f7c765e..a40a6055b 100644
--- a/gptqmodel/utils/offload.py
+++ b/gptqmodel/utils/offload.py
@@ -4,17 +4,21 @@
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 
 import contextlib
+import json
 import os
 import shutil
 from threading import Lock
 from typing import Iterable, List, Optional, Set, Tuple
 
 import accelerate
+import accelerate.utils.offload as accelerate_offload
 import torch
+import numpy as np
 
 # move base_module tensors to disk
 from accelerate import disk_offload
 from accelerate.hooks import remove_hook_from_module, remove_hook_from_submodules
+from accelerate.utils import OffloadedWeightsLoader as _AccelerateOffloadedWeightsLoader
 from accelerate.utils import align_module_device, has_offloaded_params
 from torch import nn
 
@@ -65,6 +69,146 @@ def is_meta_module(m: nn.Module) -> bool:
 _OFFLOAD_LOCK = Lock()
 
 
+_ORIGINAL_OFFLOADED_LOADER = accelerate_offload.OffloadedWeightsLoader
+_ORIGINAL_LOAD_WEIGHT = accelerate_offload.load_offloaded_weight
+
+
+class _ModuleBundledWeightsLoader(_AccelerateOffloadedWeightsLoader):
+    def __getitem__(self, key: str):
+        weight_info = self.index.get(key)
+        bundle_file = None if weight_info is None else weight_info.get("filename")
+        if bundle_file:
+            file_path = bundle_file if os.path.isabs(bundle_file) else os.path.join(self.save_folder, bundle_file)
+            storage_dtype = weight_info.get("storage_dtype", weight_info.get("dtype"))
+            offset = weight_info.get("offset", 0)
+            shape = tuple(weight_info.get("shape", ()))
+            squeezed = bool(weight_info.get("squeezed", False))
+
+            memmap_shape = shape if len(shape) > 0 else (1,)
+            np_dtype = np.dtype(storage_dtype)
+            mm = np.memmap(file_path, dtype=np_dtype, mode="r", offset=offset, shape=memmap_shape, order="C")
+            array = np.array(mm, copy=True)
+            del mm
+
+            tensor = torch.from_numpy(array)
+            target_dtype = weight_info.get("dtype")
+            if target_dtype == "bfloat16":
+                tensor = tensor.view(torch.bfloat16)
+            elif target_dtype is not None and target_dtype != storage_dtype:
+                tensor = tensor.to(getattr(torch, target_dtype))
+
+            if squeezed and len(shape) == 0:
+                tensor = tensor.reshape(())
+            return tensor
+
+        return super().__getitem__(key)
+
+
+def _load_offloaded_weight_with_bundle(weight_file, weight_info):
+    bundle_file = weight_info.get("filename") if isinstance(weight_info, dict) else None
+    if bundle_file:
+        folder = os.path.dirname(weight_file)
+        file_path = bundle_file if os.path.isabs(bundle_file) else os.path.join(folder, bundle_file)
+        storage_dtype = weight_info.get("storage_dtype", weight_info.get("dtype"))
+        offset = weight_info.get("offset", 0)
+        shape = tuple(weight_info.get("shape", ()))
+        squeezed = bool(weight_info.get("squeezed", False))
+
+        memmap_shape = shape if len(shape) > 0 else (1,)
+        np_dtype = np.dtype(storage_dtype)
+        mm = np.memmap(file_path, dtype=np_dtype, mode="r", offset=offset, shape=memmap_shape, order="C")
+        array = np.array(mm, copy=True)
+        del mm
+
+        tensor = torch.from_numpy(array)
+        target_dtype = weight_info.get("dtype")
+        if target_dtype == "bfloat16":
+            tensor = tensor.view(torch.bfloat16)
+        elif target_dtype is not None and target_dtype != storage_dtype:
+            tensor = tensor.to(getattr(torch, target_dtype))
+
+        if squeezed and len(shape) == 0:
+            tensor = tensor.reshape(())
+        return tensor
+
+    return _ORIGINAL_LOAD_WEIGHT(weight_file, weight_info)
+
+
+_ACCELERATE_PATCHED = False
+
+
+def _ensure_accelerate_bundle_patch():
+    global _ACCELERATE_PATCHED
+    if _ACCELERATE_PATCHED:
+        return
+
+    accelerate.utils.OffloadedWeightsLoader = _ModuleBundledWeightsLoader
+    accelerate_offload.OffloadedWeightsLoader = _ModuleBundledWeightsLoader
+    accelerate.utils.load_offloaded_weight = _load_offloaded_weight_with_bundle
+    accelerate_offload.load_offloaded_weight = _load_offloaded_weight_with_bundle
+    _ACCELERATE_PATCHED = True
+
+
+def _prepare_offload_directory(target_dir: str) -> None:
+    if os.path.isdir(target_dir):
+        shutil.rmtree(target_dir)
+    os.makedirs(target_dir, exist_ok=True)
+
+
+def _bundle_module_state_dict(module: nn.Module, offload_dir: str) -> dict:
+    bundle_path = os.path.join(offload_dir, "module.dat")
+    index: dict[str, dict] = {}
+    offset = 0
+
+    with open(bundle_path, "wb") as bundle_fp:
+        state_items = module.state_dict()
+
+        for key, tensor in state_items.items():
+            with torch.no_grad():
+                tensor_cpu = tensor.detach().to("cpu")
+
+            storage_tensor = tensor_cpu
+            storage_dtype = storage_tensor.dtype
+            target_dtype = tensor_cpu.dtype
+
+            if target_dtype == torch.bfloat16:
+                storage_tensor = storage_tensor.view(torch.int16)
+                storage_dtype = torch.int16
+
+            storage_tensor = storage_tensor.contiguous()
+            array = storage_tensor.numpy()
+            squeezed = False
+            if array.ndim == 0:
+                array = array.reshape(1)
+                squeezed = True
+
+            bundle_fp.write(array.tobytes(order="C"))
+
+            dtype_str = "bfloat16" if target_dtype == torch.bfloat16 else str(array.dtype)
+            entry = {
+                "dtype": dtype_str,
+                "shape": list(tensor.shape),
+                "filename": "module.dat",
+                "offset": offset,
+            }
+
+            if squeezed:
+                entry["squeezed"] = True
+
+            storage_dtype_str = str(array.dtype)
+            if storage_dtype_str != dtype_str:
+                entry["storage_dtype"] = storage_dtype_str
+
+            index[key] = entry
+            offset += array.nbytes
+
+    index_path = os.path.join(offload_dir, "index.json")
+    with open(index_path, "w", encoding="utf-8") as fp:
+        json.dump(index, fp, indent=2)
+
+    return index
+
+
 def offload_to_disk(module: List[str] | nn.Module, model: nn.Module, disk_path: str = "."):
     with _OFFLOAD_LOCK:
         _offload_to_disk_impl(module=module, model=model, disk_path=disk_path)
@@ -119,11 +263,17 @@ def _offload_disk(module: nn.Module, name: str, disk_path: str = "."):
     if not has_params and not has_buffers:
         return
 
+    module_offload_dir = os.path.join(disk_path, name)
+
+    _prepare_offload_directory(module_offload_dir)
+    _bundle_module_state_dict(module, module_offload_dir)
+
+    _ensure_accelerate_bundle_patch()
+
     _ = disk_offload(
         module,
-        # device_map={ "" : "disk" },  # only touch this subtree
-        offload_dir=f"{disk_path}/{name}",
-        offload_buffers=True,  # needed for buffers
+        offload_dir=module_offload_dir,
+        offload_buffers=True,
         execution_device=m_device,
     )
 
diff --git a/tests/test_offload_files.py b/tests/test_offload_files.py
new file mode 100644
index 000000000..d118ce6bc
--- /dev/null
+++ b/tests/test_offload_files.py
@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
+# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+import json
+
+import torch
+from tabulate import tabulate
+from torch import nn
+
+from gptqmodel.utils.offload import offload_to_disk, undo_offload_to_disk
+
+
+class _LinearWithBuffers(nn.Module):
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+        self.linear = nn.Linear(in_features, out_features, bias=True)
+        self.register_buffer("scale_buffer", torch.linspace(0.0, 1.0, out_features))
+        self.register_buffer("mask_buffer", torch.randint(0, 2, (out_features, in_features)).bool())
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(x * self.mask_buffer.float()) * self.scale_buffer
+
+
+def _clone_state_dict(module: nn.Module) -> dict[str, torch.Tensor]:
+    return {k: v.detach().clone() for k, v in module.state_dict().items()}
+
+
+def test_offload_to_disk_writes_single_dat_file(tmp_path):
+    model = _LinearWithBuffers(in_features=128, out_features=96)
+    original_state = _clone_state_dict(model.linear)
+
+    offload_root = tmp_path / "offload_root"
+    offload_to_disk(module=model.linear, model=model, disk_path=str(offload_root))
+
+    module_dir = offload_root / "linear"
+    assert module_dir.is_dir(), "Expected per-module directory to exist"
+
+    files = sorted(module_dir.iterdir(), key=lambda p: p.name)
+    rows = [(path.name, path.stat().st_size) for path in files]
+    print(tabulate(rows, headers=["file", "bytes"], tablefmt="github"))
+
+    dat_files = [path for path in files if path.suffix == ".dat"]
+    assert len(dat_files) == 1, "offload_to_disk should produce exactly one .dat file per module"
+    assert dat_files[0].name == "module.dat"
+
+    with open(module_dir / "index.json", encoding="utf-8") as fp:
+        index = json.load(fp)
+
+    expected_keys = set(model.linear.state_dict().keys())
+    assert set(index.keys()) == expected_keys
+    assert all(entry.get("filename") == "module.dat" for entry in index.values())
+
+    offsets = [entry["offset"] for entry in index.values()]
+    assert offsets == sorted(offsets), "Offsets should be monotonically increasing"
+
+    # Materialize the module back and ensure values match the snapshot captured before offload.
+    undo_offload_to_disk(model.linear, delete_offload_folders=False)
+    for name, tensor in model.linear.state_dict().items():
+        torch.testing.assert_close(tensor, original_state[name])

From 0a9cc4f80c5930f4ff1da48e525a5b10c15c9660 Mon Sep 17 00:00:00 2001
From: Qubitium <Qubitium@modelcloud.ai>
Date: Sun, 5 Oct 2025 05:58:43 +0000
Subject: [PATCH 3/5] use safetensors instead of manual numpy mmap dump

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
---
 gptqmodel/utils/offload.py  | 149 +++++++-----------------------------
 tests/test_offload_files.py |  13 ++--
 2 files changed, 32 insertions(+), 130 deletions(-)

diff --git a/gptqmodel/utils/offload.py b/gptqmodel/utils/offload.py
index a40a6055b..cd4db3590 100644
--- a/gptqmodel/utils/offload.py
+++ b/gptqmodel/utils/offload.py
@@ -7,20 +7,19 @@
 import json
 import os
 import shutil
+import struct
 from threading import Lock
 from typing import Iterable, List, Optional, Set, Tuple
 
 import accelerate
-import accelerate.utils.offload as accelerate_offload
 import torch
-import numpy as np
 
 # move base_module tensors to disk
 from accelerate import disk_offload
 from accelerate.hooks import remove_hook_from_module, remove_hook_from_submodules
-from accelerate.utils import OffloadedWeightsLoader as _AccelerateOffloadedWeightsLoader
 from accelerate.utils import align_module_device, has_offloaded_params
 from torch import nn
+from safetensors.torch import save_file as safetensors_save_file
 
 from ..looper.named_module import NamedModule
 from .device import get_device
@@ -69,86 +68,6 @@ def is_meta_module(m: nn.Module) -> bool:
 _OFFLOAD_LOCK = Lock()
 
 
-_ORIGINAL_OFFLOADED_LOADER = accelerate_offload.OffloadedWeightsLoader
-_ORIGINAL_LOAD_WEIGHT = accelerate_offload.load_offloaded_weight
-
-
-class _ModuleBundledWeightsLoader(_AccelerateOffloadedWeightsLoader):
-    def __getitem__(self, key: str):
-        weight_info = self.index.get(key)
-        bundle_file = None if weight_info is None else weight_info.get("filename")
-        if bundle_file:
-            file_path = bundle_file if os.path.isabs(bundle_file) else os.path.join(self.save_folder, bundle_file)
-            storage_dtype = weight_info.get("storage_dtype", weight_info.get("dtype"))
-            offset = weight_info.get("offset", 0)
-            shape = tuple(weight_info.get("shape", ()))
-            squeezed = bool(weight_info.get("squeezed", False))
-
-            memmap_shape = shape if len(shape) > 0 else (1,)
-            np_dtype = np.dtype(storage_dtype)
-            mm = np.memmap(file_path, dtype=np_dtype, mode="r", offset=offset, shape=memmap_shape, order="C")
-            array = np.array(mm, copy=True)
-            del mm
-
-            tensor = torch.from_numpy(array)
-            target_dtype = weight_info.get("dtype")
-            if target_dtype == "bfloat16":
-                tensor = tensor.view(torch.bfloat16)
-            elif target_dtype is not None and target_dtype != storage_dtype:
-                tensor = tensor.to(getattr(torch, target_dtype))
-
-            if squeezed and len(shape) == 0:
-                tensor = tensor.reshape(())
-            return tensor
-
-        return super().__getitem__(key)
-
-
-def _load_offloaded_weight_with_bundle(weight_file, weight_info):
-    bundle_file = weight_info.get("filename") if isinstance(weight_info, dict) else None
-    if bundle_file:
-        folder = os.path.dirname(weight_file)
-        file_path = bundle_file if os.path.isabs(bundle_file) else os.path.join(folder, bundle_file)
-        storage_dtype = weight_info.get("storage_dtype", weight_info.get("dtype"))
-        offset = weight_info.get("offset", 0)
-        shape = tuple(weight_info.get("shape", ()))
-        squeezed = bool(weight_info.get("squeezed", False))
-
-        memmap_shape = shape if len(shape) > 0 else (1,)
-        np_dtype = np.dtype(storage_dtype)
-        mm = np.memmap(file_path, dtype=np_dtype, mode="r", offset=offset, shape=memmap_shape, order="C")
-        array = np.array(mm, copy=True)
-        del mm
-
-        tensor = torch.from_numpy(array)
-        target_dtype = weight_info.get("dtype")
-        if target_dtype == "bfloat16":
-            tensor = tensor.view(torch.bfloat16)
-        elif target_dtype is not None and target_dtype != storage_dtype:
-            tensor = tensor.to(getattr(torch, target_dtype))
-
-        if squeezed and len(shape) == 0:
-            tensor = tensor.reshape(())
-        return tensor
-
-    return _ORIGINAL_LOAD_WEIGHT(weight_file, weight_info)
-
-
-_ACCELERATE_PATCHED = False
-
-
-def _ensure_accelerate_bundle_patch():
-    global _ACCELERATE_PATCHED
-    if _ACCELERATE_PATCHED:
-        return
-
-    accelerate.utils.OffloadedWeightsLoader = _ModuleBundledWeightsLoader
-    accelerate_offload.OffloadedWeightsLoader = _ModuleBundledWeightsLoader
-    accelerate.utils.load_offloaded_weight = _load_offloaded_weight_with_bundle
-    accelerate_offload.load_offloaded_weight = _load_offloaded_weight_with_bundle
-    _ACCELERATE_PATCHED = True
-
-
 def _prepare_offload_directory(target_dir: str) -> None:
     if os.path.isdir(target_dir):
         shutil.rmtree(target_dir)
@@ -156,51 +75,37 @@ def _prepare_offload_directory(target_dir: str) -> None:
 
 
 def _bundle_module_state_dict(module: nn.Module, offload_dir: str) -> dict:
-    bundle_path = os.path.join(offload_dir, "module.dat")
+    bundle_path = os.path.join(offload_dir, "module.safetensors")
     index: dict[str, dict] = {}
-    offset = 0
-
-    with open(bundle_path, "wb") as bundle_fp:
-        state_items = module.state_dict()
-
-        for key, tensor in state_items.items():
-            with torch.no_grad():
-                tensor_cpu = tensor.detach().to("cpu")
-
-            storage_tensor = tensor_cpu
-            storage_dtype = storage_tensor.dtype
-            target_dtype = tensor_cpu.dtype
+    tensors: dict[str, torch.Tensor] = {}
 
-            if target_dtype == torch.bfloat16:
-                storage_tensor = storage_tensor.view(torch.int16)
-                storage_dtype = torch.int16
-
-            storage_tensor = storage_tensor.contiguous()
-            array = storage_tensor.numpy()
-            squeezed = False
-            if array.ndim == 0:
-                array = array.reshape(1)
-                squeezed = True
-
-            bundle_fp.write(array.tobytes(order="C"))
-
-            dtype_str = "bfloat16" if target_dtype == torch.bfloat16 else str(array.dtype)
+    with torch.inference_mode():
+        for key, tensor in module.state_dict().items():
+            cpu_tensor = tensor.detach().to("cpu")
+            tensors[key] = cpu_tensor.contiguous()
             entry = {
-                "dtype": dtype_str,
-                "shape": list(tensor.shape),
-                "filename": "module.dat",
-                "offset": offset,
+                "dtype": str(cpu_tensor.dtype).replace("torch.", ""),
+                "shape": list(cpu_tensor.shape),
+                "safetensors_file": os.path.abspath(bundle_path),
+                "weight_name": key,
             }
+            index[key] = entry
 
-            if squeezed:
-                entry["squeezed"] = True
+    safetensors_save_file(tensors, bundle_path)
 
-            storage_dtype_str = str(array.dtype)
-            if storage_dtype_str != dtype_str:
-                entry["storage_dtype"] = storage_dtype_str
+    with open(bundle_path, "rb") as fh:
+        header_len = struct.unpack("<Q", fh.read(8))[0]
+        header = json.loads(fh.read(header_len).decode("utf-8"))
 
-            index[key] = entry
-            offset += array.nbytes
+    for key, tensor_meta in header.items():
+        if key == "__metadata__":
+            continue
+        entry = index.get(key)
+        if entry is None:
+            continue
+        offsets = tensor_meta.get("data_offsets")
+        if offsets is not None:
+            entry["data_offsets"] = offsets
 
     index_path = os.path.join(offload_dir, "index.json")
     with open(index_path, "w", encoding="utf-8") as fp:
@@ -268,8 +173,6 @@ def _offload_disk(module: nn.Module, name: str, disk_path: str = "."):
     _prepare_offload_directory(module_offload_dir)
     _bundle_module_state_dict(module, module_offload_dir)
 
-    _ensure_accelerate_bundle_patch()
-
     _ = disk_offload(
         module,
         offload_dir=module_offload_dir,
diff --git a/tests/test_offload_files.py b/tests/test_offload_files.py
index d118ce6bc..29964299b 100644
--- a/tests/test_offload_files.py
+++ b/tests/test_offload_files.py
@@ -4,6 +4,7 @@
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 
 import json
+from pathlib import Path
 
 import torch
 from tabulate import tabulate
@@ -41,19 +42,17 @@ def test_offload_to_disk_writes_single_dat_file(tmp_path):
     rows = [(path.name, path.stat().st_size) for path in files]
     print(tabulate(rows, headers=["file", "bytes"], tablefmt="github"))
 
-    dat_files = [path for path in files if path.suffix == ".dat"]
-    assert len(dat_files) == 1, "offload_to_disk should produce exactly one .dat file per module"
-    assert dat_files[0].name == "module.dat"
+    safetensor_files = [path for path in files if path.suffix == ".safetensors"]
+    assert len(safetensor_files) == 1, "offload_to_disk should produce exactly one safetensors file per module"
+    assert safetensor_files[0].name == "module.safetensors"
 
     with open(module_dir / "index.json", encoding="utf-8") as fp:
         index = json.load(fp)
 
     expected_keys = set(model.linear.state_dict().keys())
     assert set(index.keys()) == expected_keys
-    assert all(entry.get("filename") == "module.dat" for entry in index.values())
-
-    offsets = [entry["offset"] for entry in index.values()]
-    assert offsets == sorted(offsets), "Offsets should be monotonically increasing"
+    assert all(Path(entry.get("safetensors_file")).name == "module.safetensors" for entry in index.values())
+    assert all(entry.get("data_offsets") is not None for entry in index.values())
 
     # Materialize the module back and ensure values match the snapshot captured before offload.
     undo_offload_to_disk(model.linear, delete_offload_folders=False)

From bfeda07f972a11693b5375238f19cb35ecef1433 Mon Sep 17 00:00:00 2001
From: Qubitium <Qubitium@modelcloud.ai>
Date: Sun, 5 Oct 2025 06:26:56 +0000
Subject: [PATCH 4/5] use safetensors instead of numpy mmap dump

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
---
 gptqmodel/looper/loop_processor.py | 50 +++++++++++++++++++++---------
 gptqmodel/utils/offload.py         |  4 ++-
 tests/test_offload_files.py        | 21 +++++++++++++
 3 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/gptqmodel/looper/loop_processor.py b/gptqmodel/looper/loop_processor.py
index e177434d4..7c6fc4ceb 100644
--- a/gptqmodel/looper/loop_processor.py
+++ b/gptqmodel/looper/loop_processor.py
@@ -4,6 +4,7 @@
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 import json
 import queue
+import re
 import threading
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple
@@ -26,6 +27,8 @@
 # global level lock
 PROCESSOR_GLOBAL_LOCK = threading.Lock()
 
+ANSI_ESCAPE_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
+
 # LoopProcessor is a singleton(), not per module instance
 class LoopProcessor:
     def __init__(
@@ -191,41 +194,58 @@ def loss_color(self, loss_value):
         else:
             return "\033[91m"  # Red
 
+    def _strip_ansi(self, text: Any) -> str:
+        return ANSI_ESCAPE_RE.sub("", str(text))
+
+    def _visible_length(self, text: Any) -> int:
+        return len(self._strip_ansi(text))
+
+    def _ljust_visible(self, text: str, width: int) -> str:
+        padding = max(width - self._visible_length(text), 0)
+        if padding:
+            return f"{text}{' ' * padding}"
+        return text
+
     def log_new_row(self, stat):
         self.log_call_count += 1
         self.log_save_async(stat) # write to temp log file
 
         # Update max_widths with the new row's column widths
         for key, value in stat.items():
-            current_width = max(len(str(key)), len(str(value))) + 4 # 4 is for padding
+            key_str = str(key)
+            value_str = str(value)
+            current_width = max(self._visible_length(key_str), self._visible_length(value_str)) + 4 # 4 is for padding
             if key not in self.log_max_widths or current_width > self.log_max_widths[key]:
                 self.log_max_widths[key] = current_width
 
         if self.log_call_count % 20 == 1:
             # Format the header row
-            header_row = "| " + " | ".join(
-                str(key).ljust(self.log_max_widths[key]) for key in self.log_max_widths.keys()) + " |"
+            header_cells = [
+                self._ljust_visible(str(key), self.log_max_widths[key]) for key in self.log_max_widths.keys()
+            ]
+            header_row = "| " + " | ".join(header_cells) + " |"
+            header_separator = "-" * self._visible_length(header_row)
 
             if self.log_call_count == 1:
-                log.info(len(header_row) * "-")
+                log.info(header_separator)
             log.info(header_row)
-            log.info(len(header_row) * "-")
+            log.info(header_separator)
 
-        formatted_row = "| "
+        row_cells = []
         for key in self.log_max_widths.keys():
             value = stat.get(key, "")
-            if key == "loss":
-                color_code = self.loss_color(float(value))
-                formatted_value = f"{color_code}{value}\033[0m"
+            value_str = str(value)
+            if key == "loss" and value_str:
+                color_code = self.loss_color(float(value_str))
+                formatted_value = f"{color_code}{value_str}\033[0m"
             else:
-                formatted_value = str(value)
-            formatted_row += formatted_value.ljust(self.log_max_widths[key]) + " | "
-
-        # formatted_row = "| " + " | ".join(
-        #     str(stat.get(key, "")).ljust(self.log_max_widths[key]) for key in self.log_max_widths.keys()) + " |"
+                formatted_value = value_str
+            row_cells.append(self._ljust_visible(formatted_value, self.log_max_widths[key]))
 
+        formatted_row = "| " + " | ".join(row_cells) + " |"
+        row_separator = "-" * self._visible_length(formatted_row)
         log.info(formatted_row)
-        log.info(len(formatted_row) * "-")
+        log.info(row_separator)
 
     def _init_device_smi_handles(self) -> Dict[str, Device]:
         handles: Dict[str, Device] = {}
diff --git a/gptqmodel/utils/offload.py b/gptqmodel/utils/offload.py
index cd4db3590..4075df08d 100644
--- a/gptqmodel/utils/offload.py
+++ b/gptqmodel/utils/offload.py
@@ -96,6 +96,7 @@ def _bundle_module_state_dict(module: nn.Module, offload_dir: str) -> dict:
     with open(bundle_path, "rb") as fh:
         header_len = struct.unpack("<Q", fh.read(8))[0]
         header = json.loads(fh.read(header_len).decode("utf-8"))
+        data_offset_base = fh.tell()
 
     for key, tensor_meta in header.items():
         if key == "__metadata__":
@@ -105,7 +106,8 @@ def _bundle_module_state_dict(module: nn.Module, offload_dir: str) -> dict:
             continue
         offsets = tensor_meta.get("data_offsets")
         if offsets is not None:
-            entry["data_offsets"] = offsets
+            start, end = (int(offsets[0]), int(offsets[1]))
+            entry["data_offsets"] = [data_offset_base + start, data_offset_base + end]
 
     index_path = os.path.join(offload_dir, "index.json")
     with open(index_path, "w", encoding="utf-8") as fp:
diff --git a/tests/test_offload_files.py b/tests/test_offload_files.py
index 29964299b..adf79e520 100644
--- a/tests/test_offload_files.py
+++ b/tests/test_offload_files.py
@@ -9,8 +9,10 @@
 import torch
 from tabulate import tabulate
 from torch import nn
+from safetensors import safe_open
 
 from gptqmodel.utils.offload import offload_to_disk, undo_offload_to_disk
+from gptqmodel.utils.model import get_state_dict_for_save, streaming_state_dict_to_shards
 
 
 class _LinearWithBuffers(nn.Module):
@@ -54,6 +56,25 @@ def test_offload_to_disk_writes_single_dat_file(tmp_path):
     assert all(Path(entry.get("safetensors_file")).name == "module.safetensors" for entry in index.values())
     assert all(entry.get("data_offsets") is not None for entry in index.values())
 
+    save_dir = tmp_path / "saved"
+    save_dir.mkdir()
+    state_dict = get_state_dict_for_save(model, offload_root=str(offload_root))
+    expected_files, tensor_to_filename, _ = streaming_state_dict_to_shards(
+        state_dict,
+        save_dir=str(save_dir),
+        model_base_name="model",
+        single_file_name="model.safetensors",
+        metadata={},
+        max_shard_size=None,
+    )
+
+    assert len(expected_files) == 1
+    shard_path = save_dir / expected_files[0]
+    with safe_open(str(shard_path), framework="pt", device="cpu") as handler:
+        for name, tensor in original_state.items():
+            saved = handler.get_tensor(f"linear.{name}")
+            torch.testing.assert_close(saved, tensor)
+
     # Materialize the module back and ensure values match the snapshot captured before offload.
     undo_offload_to_disk(model.linear, delete_offload_folders=False)
     for name, tensor in model.linear.state_dict().items():

From 599e1311caf62e35fad1f0dddfe9199c1dee80c9 Mon Sep 17 00:00:00 2001
From: Qubitium <Qubitium@modelcloud.ai>
Date: Sun, 5 Oct 2025 06:28:55 +0000
Subject: [PATCH 5/5] ruff

Signed-off-by: Qubitium <Qubitium@modelcloud.ai>
---
 gptqmodel/utils/offload.py  | 2 +-
 tests/test_offload_files.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/utils/offload.py b/gptqmodel/utils/offload.py
index 4075df08d..c84285d75 100644
--- a/gptqmodel/utils/offload.py
+++ b/gptqmodel/utils/offload.py
@@ -18,8 +18,8 @@
 from accelerate import disk_offload
 from accelerate.hooks import remove_hook_from_module, remove_hook_from_submodules
 from accelerate.utils import align_module_device, has_offloaded_params
-from torch import nn
 from safetensors.torch import save_file as safetensors_save_file
+from torch import nn
 
 from ..looper.named_module import NamedModule
 from .device import get_device
diff --git a/tests/test_offload_files.py b/tests/test_offload_files.py
index adf79e520..80288f17b 100644
--- a/tests/test_offload_files.py
+++ b/tests/test_offload_files.py
@@ -7,12 +7,12 @@
 from pathlib import Path
 
 import torch
+from safetensors import safe_open
 from tabulate import tabulate
 from torch import nn
-from safetensors import safe_open
 
-from gptqmodel.utils.offload import offload_to_disk, undo_offload_to_disk
 from gptqmodel.utils.model import get_state_dict_for_save, streaming_state_dict_to_shards
+from gptqmodel.utils.offload import offload_to_disk, undo_offload_to_disk
 
 
 class _LinearWithBuffers(nn.Module):