ModelCloud · Qubitium · Oct 3, 2025 · Oct 2, 2025 · Oct 2, 2025 · Oct 2, 2025
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -58,7 +58,7 @@ env:
   CUDA_DEVICE_ORDER: PCI_BUS_ID
   CUDA_VISIBLE_DEVICES: 0
   TORCH_CUDA_ARCH_LIST: '8.6 8.9 9.0 12.0'
-  PYTORCH_CUDA_ALLOC_CONF: 'expandable_segments:True'
+  PYTORCH_ALLOC_CONF: 'expandable_segments:True'
   MAX_JOBS: 4
   RUNNER: 10.0.13.31
   XEON5: 10.0.14.248

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,7 +1,10 @@
-global-include gptqmodel_ext/**/*.cuh
-global-include gptqmodel_ext/**/*.h
-global-include gptqmodel_ext/**/*.cpp
-global-include gptqmodel_ext/**/*.cu
-global-include gptqmodel_ext/**/*.py
+recursive-include gptqmodel_ext/awq *.h *.cuh *.cu *.cpp
+recursive-include gptqmodel_ext/exllama *.h *.cuh *.cu *.cpp
+recursive-include gptqmodel_ext/exllamav2 *.h *.cuh *.cu *.cpp
+recursive-include gptqmodel_ext/exllama_eora/eora *.h *.cuh *.cu *.cpp *.py
+recursive-include gptqmodel_ext/marlin *.h *.cuh *.cu *.cpp
+recursive-include gptqmodel_ext/qqq *.h *.cuh *.cu *.cpp
+include gptqmodel_ext/marlin/generate_kernels.py
+recursive-exclude gptqmodel_ext __pycache__ *.pyc
 prune tests/
 prune format/
diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py
@@ -12,7 +12,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 
 if __name__ == "__main__":
     """

diff --git a/examples/eora/eora_generation.py b/examples/eora/eora_generation.py
@@ -18,7 +18,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 

diff --git a/examples/eora/eora_load_and_inference.py b/examples/eora/eora_load_and_inference.py
@@ -18,7 +18,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402

diff --git a/examples/eora/evaluation.py b/examples/eora/evaluation.py
@@ -18,7 +18,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 from typing import Optional  # noqa: E402

diff --git a/examples/eora/post_quant_eora_generation.py b/examples/eora/post_quant_eora_generation.py
@@ -18,7 +18,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 # -- end do not touch
 
 

diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py
@@ -14,7 +14,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 
 pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g"

diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py
@@ -11,7 +11,7 @@
 
 
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 
 pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

diff --git a/gptqmodel/looper/eora_processor.py b/gptqmodel/looper/eora_processor.py
@@ -22,6 +22,7 @@
 from ..utils.logger import setup_logger
 from ..utils.model import move_to
 from ..utils.torch import CPU, DEVICE_0, DEVICE_1, torch_streamCtx, torch_sync
+from ..utils.torch import HAS_CUDA, tf32_disable_guard
 
 log = setup_logger()
 

diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
@@ -18,9 +18,9 @@
 #     os.environ["PYTHON_GIL"] = '0'
 #     log.info("ENV: Auto disable GIL and use free-threading mode when applicable: Python 3.13t+. You must install the -t edition of Python.")
 
-if not os.environ.get("PYTORCH_CUDA_ALLOC_CONF", None):
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True'
-    log.info("ENV: Auto setting PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' for memory saving.")
+if not (os.environ.get("PYTORCH_ALLOC_CONF") or os.environ.get("PYTORCH_CUDA_ALLOC_CONF")):  # honor a user-set legacy alias too
+    os.environ["PYTORCH_ALLOC_CONF"] = 'expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7'
+    log.info("ENV: Auto setting PYTORCH_ALLOC_CONF='expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7' for memory saving.")
 
 if not os.environ.get("CUDA_DEVICE_ORDER", None):
     os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'