From 1b57a67521acebdddd2aa6240633712e73a99f96 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Wed, 22 Oct 2025 15:58:35 +0800
Subject: [PATCH 1/3] include package data

---
 setup.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 7fd343759..c84893a78 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@
 from setuptools import find_namespace_packages, find_packages, setup
 from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel
 
-
 CUTLASS_VERSION = "3.5.0"
 CUTLASS_RELEASE_URL = f"https://github.com/NVIDIA/cutlass/archive/refs/tags/v{CUTLASS_VERSION}.tar.gz"
 
@@ -420,6 +419,7 @@ def _resolve_wheel_url(tag_name: str, wheel_name: str) -> str:
     # Fallback: default GitHub template
     return DEFAULT_WHEEL_URL_TEMPLATE.format(tag_name=tag_name, wheel_name=wheel_name)
+
 # Decide HAS_CUDA_V8 / HAS_CUDA_V9 without torch
 HAS_CUDA_V8 = False
 HAS_CUDA_V9 = False
@@ -526,7 +526,7 @@ def _env_enabled_any(names, default="1") -> bool:
     print(f"Using NVCC_THREADS={nvcc_threads} for per-invocation NVCC concurrency.")
 
 # Optional conda CUDA runtime headers
-#conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
+# conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
 # if os.path.isdir(conda_cuda_include_dir):
 #     include_dirs.append(conda_cuda_include_dir)
 #     print(f"appending conda cuda include dir {conda_cuda_include_dir}")
@@ -786,6 +786,8 @@ def _hipify_compile_flags(flags):
 
 additional_setup_kwargs = {
     "ext_modules": extensions,
+    "include_package_data": True,
+    "package_data": {"": ["build/lib/*.so"]},
     "cmdclass": {"build_ext": cpp_ext.BuildExtension.with_options(
         use_ninja=True,
         no_python_abi_suffix=True,
@@ -850,7 +852,7 @@ def run(self):
         _packages.append(_pkg)
 
 setup(
-    version = gptqmodel_version,
+    version=gptqmodel_version,
     packages=_packages,
     include_package_data=True,
     extras_require={

From 3517cc44206a135bbdd0bee779160afed0ffc232 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Wed, 22 Oct 2025 16:46:14 +0800
Subject: [PATCH 2/3] disable build_lib

---
 setup.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index c84893a78..aefb3cd17 100644
--- a/setup.py
+++ b/setup.py
@@ -614,7 +614,7 @@ def _hipify_compile_flags(flags):
 # Extensions (gate marlin/qqq/eora/exllamav2 on CUDA sm_80+ and non-ROCm)
 if sys.platform != "win32":
     if not ROCM_VERSION and HAS_CUDA_V8:
-        if BUILD_MARLIN:
+        if False and BUILD_MARLIN:
             marlin_kernel_dir = Path("gptqmodel_ext/marlin")
             marlin_kernel_files = sorted(marlin_kernel_dir.glob("kernel_*.cu"))
 
@@ -645,7 +645,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
+        if False and BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
             try:
                 result = subprocess.run(
                     [sys.executable, "gptqmodel_ext/machete/generate.py"],
@@ -686,7 +686,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if BUILD_QQQ:
+        if False and BUILD_QQQ:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_qqq_kernels",
@@ -699,7 +699,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if BUILD_EORA:
+        if False and BUILD_EORA:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_exllama_eora",
@@ -711,7 +711,7 @@ def _hipify_compile_flags(flags):
                 extra_compile_args=extra_compile_args,
             )
         ]
-        if BUILD_EXLLAMA_V2:
+        if False and BUILD_EXLLAMA_V2:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_exllamav2_kernels",
@@ -726,7 +726,7 @@ def _hipify_compile_flags(flags):
             ]
 
     # both CUDA and ROCm compatible
-    if BUILD_EXLLAMA_V1:
+    if True:
         extensions += [
             cpp_ext.CUDAExtension(
                 "gptqmodel_exllama_kernels",
@@ -742,7 +742,7 @@ def _hipify_compile_flags(flags):
             )
         ]
 
-    if BUILD_AWQ:
+    if False and BUILD_AWQ:
         if ROCM_VERSION:
             print("Skipping AWQ kernels on ROCm: inline PTX is CUDA-only.")
         else:
@@ -786,13 +786,13 @@ def _hipify_compile_flags(flags):
 
 additional_setup_kwargs = {
     "ext_modules": extensions,
-    "include_package_data": True,
-    "package_data": {"": ["build/lib/*.so"]},
+    # "include_package_data": True,
+    # "package_data": {"": ["build/lib/*.so"]},
     "cmdclass": {"build_ext": cpp_ext.BuildExtension.with_options(
         use_ninja=True,
         no_python_abi_suffix=True,
         build_temp="build/temp",
-        build_lib="build/lib",
+        # build_lib="build/lib",  TODO FIX ME why package_data doesn't work..
         clean_first=False  # keep intermediates for reuse
     )},
 }

From f4ba974de514c21b864ba838d2f07ad3cff21ea6 Mon Sep 17 00:00:00 2001
From: CSY-ModelCloud
Date: Wed, 22 Oct 2025 16:48:41 +0800
Subject: [PATCH 3/3] remove test code

---
 setup.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index aefb3cd17..9a83f75fc 100644
--- a/setup.py
+++ b/setup.py
@@ -614,7 +614,7 @@ def _hipify_compile_flags(flags):
 # Extensions (gate marlin/qqq/eora/exllamav2 on CUDA sm_80+ and non-ROCm)
 if sys.platform != "win32":
     if not ROCM_VERSION and HAS_CUDA_V8:
-        if False and BUILD_MARLIN:
+        if BUILD_MARLIN:
             marlin_kernel_dir = Path("gptqmodel_ext/marlin")
             marlin_kernel_files = sorted(marlin_kernel_dir.glob("kernel_*.cu"))
 
@@ -645,7 +645,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if False and BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
+        if BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
             try:
                 result = subprocess.run(
                     [sys.executable, "gptqmodel_ext/machete/generate.py"],
@@ -686,7 +686,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if False and BUILD_QQQ:
+        if BUILD_QQQ:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_qqq_kernels",
@@ -699,7 +699,7 @@ def _hipify_compile_flags(flags):
                 )
             ]
 
-        if False and BUILD_EORA:
+        if BUILD_EORA:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_exllama_eora",
@@ -711,7 +711,7 @@ def _hipify_compile_flags(flags):
                 extra_compile_args=extra_compile_args,
             )
         ]
-        if False and BUILD_EXLLAMA_V2:
+        if BUILD_EXLLAMA_V2:
             extensions += [
                 cpp_ext.CUDAExtension(
                     "gptqmodel_exllamav2_kernels",
@@ -726,7 +726,7 @@ def _hipify_compile_flags(flags):
             ]
 
     # both CUDA and ROCm compatible
-    if True:
+    if BUILD_EXLLAMA_V1:
         extensions += [
             cpp_ext.CUDAExtension(
                 "gptqmodel_exllama_kernels",
@@ -742,7 +742,7 @@ def _hipify_compile_flags(flags):
             )
         ]
 
-    if False and BUILD_AWQ:
+    if BUILD_AWQ:
         if ROCM_VERSION:
             print("Skipping AWQ kernels on ROCm: inline PTX is CUDA-only.")
         else:
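
A note on the "TODO FIX ME why package_data doesn't work" comment left behind by
patch 2: setuptools resolves package_data globs relative to each package's own
source directory, never the project root, so the pattern "build/lib/*.so" from
patch 1 (keyed to "", i.e. all packages) cannot match files under the top-level
build/ tree. Below is a minimal sketch of a configuration where package_data
does pick up .so files. It is illustrative only: the package name "gptqmodel" is
assumed from this repo, and "example" is a placeholder project name.

    # sketch_setup.py: a minimal sketch, not this project's actual setup.py
    from setuptools import setup

    setup(
        name="example",                  # placeholder project name
        packages=["gptqmodel"],          # package name assumed from this repo
        include_package_data=True,
        # package_data keys are package names; each glob is resolved relative
        # to that package's directory, so this pattern matches gptqmodel/*.so,
        # never <project-root>/build/lib/*.so.
        package_data={"gptqmodel": ["*.so"]},
    )

Separately, extensions declared in ext_modules are compiled by build_ext and
included in the wheel automatically, which is likely why commenting out the
build_lib override in patch 2 was sufficient and the package_data globs from
patch 1 were never needed.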