diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 114986dc3..7da1cc520 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -310,7 +310,8 @@ jobs:
 
       - name: Compile
         if: env.SHOULD_RUN == 1
-        run: python setup.py bdist_wheel
+        run: |
+          python setup.py bdist_wheel
 
       - name: Test install
         if: env.SHOULD_RUN == 1
@@ -395,7 +396,8 @@ jobs:
           git checkout pr-${PR_NUMBER}
 
       - name: Compile
-        run: python -m build --no-isolation --sdist
+        run: |
+          python -m build --no-isolation --sdist
 
       - name: Check dist
         run: |
@@ -478,7 +480,7 @@ jobs:
           git config --global --add safe.directory $(pwd)
           git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER}
           git checkout pr-${PR_NUMBER}
-          
+
       - uses: actions/setup-python@v6
         with:
           python-version: 3.13
@@ -488,8 +490,11 @@ jobs:
         run: |
           pip install build setuptools uv -U
           uv pip install torch twine ninja --system
+
       - name: Compile
-        run: python -m build --no-isolation --sdist
+        run: |
+          python -m build --no-isolation --sdist
+
 
       - name: Check dist
         run: |
@@ -497,6 +502,7 @@ jobs:
           whl=$(ls -t dist/*.gz | head -n 1 | xargs basename)
           echo "WHL_NAME=$whl" >> $GITHUB_ENV
           twine check dist/$whl
+
       - name: Upload to artifact
         uses: actions/upload-artifact@v4
         with:
@@ -516,6 +522,7 @@ jobs:
         if: (github.event_name == 'release' || github.event.inputs.upload_pypi == 'true') && !cancelled()
         run: |
           for i in {1..5}; do sleep 5; done
+
       - name: Upload sdist to pypi
         if: (github.event_name == 'release' || github.event.inputs.upload_pypi == 'true') && !cancelled()
         env:
diff --git a/gptqmodel_ext/machete/generate.py b/gptqmodel_ext/machete/generate.py
index bea0bddda..52bd805de 100644
--- a/gptqmodel_ext/machete/generate.py
+++ b/gptqmodel_ext/machete/generate.py
@@ -26,6 +26,8 @@
 
 _CUTLASS_PYTHON_DIR = _CUTLASS_ROOT / "python"
 
+_CUTLASS_PYTHON_DIR.mkdir(parents=True, exist_ok=True)  # NOTE(review): once this succeeds, the .exists() guard below always passes - confirm creating the directory here is intended
+
 if str(_CUTLASS_EXT_DIR) not in sys.path:
     sys.path.append(str(_CUTLASS_EXT_DIR))
 if _CUTLASS_PYTHON_DIR.exists() and str(_CUTLASS_PYTHON_DIR) not in sys.path:
diff --git a/setup.py b/setup.py
index 1234b3bd2..f9a5027e3 100644
--- a/setup.py
+++ b/setup.py
@@ -615,33 +615,20 @@ def _hipify_compile_flags(flags):
                     ]
 
                 if BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
-                    machete_dir = Path("gptqmodel_ext/machete")
-                    machete_generated_dir = machete_dir / "generated"
-
-                    machete_sources = [str(machete_dir / "machete_pytorch.cu")]
-                    machete_generated_sources = sorted(machete_generated_dir.glob("*.cu"))
-
-                    if not machete_generated_sources:
-                        raise RuntimeError(
-                            "No generated machete kernel templates detected. Run gptqmodel_ext/machete/generate.py"
-                            " with CUTLASS checkout before building."
+                    try:
+                        result = subprocess.run(  # NOTE(review): `result` is not read in the visible code
+                            [sys.executable, "gptqmodel_ext/machete/generate.py"],  # same interpreter as this build; script path is relative to the current working directory
+                            check=True,  # a non-zero exit raises CalledProcessError, handled below
+                            text=True,  # captured output is decoded to str
+                            capture_output=True  # keep stdout/stderr so they can be reported if generation fails
                         )
-
-                    machete_sources += [str(path) for path in machete_generated_sources]
-
-                    machete_include_dirs = [str(Path("gptqmodel_ext").resolve())] + [str(path) for path in cutlass_include_paths]
-
-                    extensions += [
-                        cpp_ext.CUDAExtension(
-                            "gptqmodel_machete_kernels",
-                            machete_sources,
-                            extra_link_args=extra_link_args,
-                            extra_compile_args=extra_compile_args,
-                            include_dirs=machete_include_dirs,
+                    except subprocess.CalledProcessError as e:
+                        raise RuntimeError(
+                            f"Error generating machete kernel templates:\n"
+                            f"Return code: {e.returncode}\n"
+                            f"Stderr: {e.stderr}\n"
+                            f"Stdout: {e.stdout}"
                         )
-                    ]
-
-                if BUILD_MACHETE and HAS_CUDA_V9 and _version_geq(NVCC_VERSION, 12, 0):
                     machete_dir = Path("gptqmodel_ext/machete")
                     machete_generated_dir = machete_dir / "generated"