NVIDIA · leofang · Dec 17, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -52,7 +52,7 @@ body:
     attributes:
       label: Describe the bug
       description: A clear and concise description of what problem you are running into.
-      placeholder: "Attempting to compile a program via `cuda.core.experimental.Program.compile` throws a `ValueError`."
+      placeholder: "Attempting to compile a program via `cuda.core.Program.compile` throws a `ValueError`."
     validations:
       required: true
 
@@ -62,7 +62,7 @@ body:
       label: How to Reproduce
       description: Steps used to reproduce the bug.
       placeholder: |
-        0. Construct a `cuda.core.experimental.Program` instance
+        0. Construct a `cuda.core.Program` instance
         1. Call the `.compile(...)` method of the instance
         2. The call throws a `ValueError` with the following:
         ```
@@ -76,7 +76,7 @@ body:
     attributes:
       label: Expected behavior
       description: A clear and concise description of what you expected to happen.
-      placeholder: "Using `cuda.core.experimental.Program.compile(...)` should run successfully and not throw a `ValueError`"
+      placeholder: "Using `cuda.core.Program.compile(...)` should run successfully and not throw a `ValueError`"
     validations:
       required: true
 

diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -36,7 +36,7 @@ body:
     attributes:
       label: Is your feature request related to a problem? Please describe.
       description: A clear and concise description of what the problem is, e.g., "I would like to be able to..."
-      placeholder: I would like to be able to use the equivalent of `cuda.core.experimental.Program.compile(...)` to compile my code to PTX.
+      placeholder: I would like to be able to use the equivalent of `cuda.core.Program.compile(...)` to compile my code to PTX.
     validations:
       required: true
 
@@ -46,7 +46,7 @@ body:
       label: Describe the solution you'd like
       description: A clear and concise description of what you want to happen.
       placeholder: |
-        Support a `ptx` target_type in the `cuda.core.experimental.Program.compile(...)` function.
+        Support a `ptx` target_type in the `cuda.core.Program.compile(...)` function.
     validations:
       required: true
 
@@ -57,7 +57,7 @@ body:
       description:
         If applicable, please add a clear and concise description of any alternative solutions or features you've
         considered.
-      placeholder: The alternatives to using `cuda.core.experimental.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable.
+      placeholder: The alternatives to using `cuda.core.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable.
     validations:
       required: false
 

diff --git a/.spdx-ignore b/.spdx-ignore
@@ -9,6 +9,6 @@ requirements*.txt
 cuda_bindings/examples/*
 
 # Vendored
-cuda_core/cuda/core/experimental/include/dlpack.h
+cuda_core/cuda/core/_include/dlpack.h
 
 qa/ctk-next.drawio.svg
diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py
@@ -12,8 +12,8 @@
 
 In particular, each wheel contains a CUDA-specific build of the `cuda.core` library
 and the associated bindings. This script merges these directories into a single wheel
-that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12`
-and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py`
+that supports both CUDA versions, i.e., containing both `cuda/core/cu12`
+and `cuda/core/cu13`. At runtime, the code in `cuda/core/__init__.py`
 is used to import the appropriate CUDA-specific bindings.
 
 This script is based on the one in NVIDIA/CCCL.
@@ -25,6 +25,7 @@
 import subprocess
 import sys
 import tempfile
+import zipfile
 from pathlib import Path
 from typing import List
 
@@ -46,7 +47,38 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub
     return result
 
 
-def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
+def print_wheel_directory_structure(wheel_path: Path, filter_prefix: str = "cuda/core/", label: str = None):
+    """Print the directory structure of a wheel file, similar to unzip -l output.
+
+    Args:
+        wheel_path: Path to the wheel file to inspect
+        filter_prefix: Only show files matching this prefix (default: "cuda/core/")
+        label: Optional label to print before the structure (e.g., "Input wheel 1: name.whl")
+    """
+    if label:  # header is skipped when label is None or an empty string
+        print(f"\n--- {label} ---", file=sys.stderr)
+    try:
+        with zipfile.ZipFile(wheel_path, "r") as zf:  # the wheel is read as a zip archive, read-only
+            print(f"{'Length':>10}  {'Date':>12}  {'Time':>8}  Name", file=sys.stderr)  # column header; all output here goes to stderr
+            print("-" * 80, file=sys.stderr)
+            total_size = 0
+            file_count = 0
+            for name in sorted(zf.namelist()):  # entries are listed in name order
+                if filter_prefix in name:  # NOTE: substring test -- matches filter_prefix anywhere in the entry name, not only at its start
+                    info = zf.getinfo(name)
+                    total_size += info.file_size  # ZipInfo.file_size is the uncompressed size
+                    file_count += 1
+                    date_time = info.date_time  # (year, month, day, hour, minute, second)
+                    date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}"
+                    time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}"
+                    print(f"{info.file_size:10d}  {date_str}  {time_str}  {name}", file=sys.stderr)
+            print("-" * 80, file=sys.stderr)
+            print(f"{total_size:10d}                    {file_count} files", file=sys.stderr)  # footer: summed size and count of the listed entries only
+    except Exception as e:  # any failure is reported as a warning on stderr and not re-raised
+        print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr)
+
+
+def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool = True) -> Path:
     """Merge multiple wheels into a single wheel with version-specific binaries."""
     print("\n=== Merging wheels ===", file=sys.stderr)
     print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr)
@@ -91,30 +123,50 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
 
             extracted_wheels.append(extract_dir)
 
+        if show_wheel_contents:
+            print("\n=== Input wheel directory structures ===", file=sys.stderr)
+            for i, wheel in enumerate(wheels):
+                print_wheel_directory_structure(wheel, label=f"Input wheel {i + 1}: {wheel.name}")
+
         # Use the first wheel as the base and merge binaries from others
         base_wheel = extracted_wheels[0]
 
-        # now copy the version-specific directory from other wheels
-        # into the appropriate place in the base wheel
+        # Copy version-specific directories from each wheel into versioned subdirectories
+        base_dir = Path("cuda") / "core"
+
         for i, wheel_dir in enumerate(extracted_wheels):
             cuda_version = wheels[i].name.split(".cu")[1].split(".")[0]
-            base_dir = Path("cuda") / "core" / "experimental"
-            # Copy from other wheels
-            print(f"  Copying {wheel_dir} to {base_wheel}", file=sys.stderr)
-            shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}")
-
-            # Overwrite the __init__.py in versioned dirs
-            os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0)
-
-        # The base dir should only contain __init__.py, the include dir, and the versioned dirs
-        files_to_remove = os.scandir(base_wheel / base_dir)
-        for f in files_to_remove:
+            versioned_dir = base_wheel / base_dir / f"cu{cuda_version}"
+
+            # Copy entire directory tree from source wheel to versioned directory
+            print(f"  Copying {wheel_dir / base_dir} to {versioned_dir}", file=sys.stderr)
+            shutil.copytree(wheel_dir / base_dir, versioned_dir, dirs_exist_ok=True)
+
+            # Overwrite the __init__.py in versioned dirs to be empty
+            os.truncate(versioned_dir / "__init__.py", 0)
+
+        print("\n=== Removing files from cuda/core/ directory ===", file=sys.stderr)
+        items_to_keep = (
+            "__init__.py",
+            "_version.py",
+            "_include",
+            "cu12",
+            "cu13",
+        )
+        all_items = os.scandir(base_wheel / base_dir)
+        removed_count = 0
+        for f in all_items:
             f_abspath = f.path
-            if f.name not in ("__init__.py", "cu12", "cu13", "include"):
-                if f.is_dir():
-                    shutil.rmtree(f_abspath)
-                else:
-                    os.remove(f_abspath)
+            if f.name in items_to_keep:
+                continue
+            if f.is_dir():
+                print(f"  Removing directory: {f.name}", file=sys.stderr)
+                shutil.rmtree(f_abspath)
+            else:
+                print(f"  Removing file: {f.name}", file=sys.stderr)
+                os.remove(f_abspath)
+            removed_count += 1
+        print(f"Removed {removed_count} items from cuda/core/ directory", file=sys.stderr)
 
         # Repack the merged wheel
         output_dir.mkdir(parents=True, exist_ok=True)
@@ -142,6 +194,11 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
 
         merged_wheel = output_wheels[0]
         print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr)
+
+        if show_wheel_contents:
+            print("\n=== Output wheel directory structure ===", file=sys.stderr)
+            print_wheel_directory_structure(merged_wheel)
+
         return merged_wheel
 
 

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
@@ -68,7 +68,7 @@ def _build_cuda_core():
 
     # It seems setuptools' wildcard support has problems for namespace packages,
     # so we explicitly spell out all Extension instances.
-    root_module = "cuda.core.experimental"
+    root_module = "cuda.core"
     root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep
     ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True)
 
@@ -86,6 +86,7 @@ def get_cuda_paths():
         print("CUDA paths:", CUDA_PATH)
         return CUDA_PATH
 
+    all_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths())
     extra_compile_args = []
     if COMPILE_FOR_COVERAGE:
         # CYTHON_TRACE_NOGIL indicates to trace nogil functions.  It is not
@@ -94,9 +95,9 @@ def get_cuda_paths():
 
     ext_modules = tuple(
         Extension(
-            f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}",
-            sources=[f"cuda/core/experimental/{mod}.pyx"],
-            include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()),
+            f"cuda.core.{mod.replace(os.path.sep, '.')}",
+            sources=[f"cuda/core/{mod}.pyx"],
+            include_dirs=all_include_dirs,
             language="c++",
             extra_compile_args=extra_compile_args,
         )

diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py
@@ -3,3 +3,67 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cuda.core._version import __version__
+
+try:
+    from cuda import bindings
+except ImportError:
+    raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None  # "from None" drops the original ImportError from the traceback chain
+else:
+    cuda_major, cuda_minor = bindings.__version__.split(".")[:2]  # first two dot-separated fields of the version string, kept as strings
+    if cuda_major not in ("12", "13"):
+        raise ImportError("cuda.bindings 12.x or 13.x must be installed")
+
+import importlib
+
+subdir = f"cu{cuda_major}"  # "cu12" or "cu13", given the check above
+try:
+    versioned_mod = importlib.import_module(f".{subdir}", __package__)  # relative import of the versioned subpackage
+    # Import all symbols from the module
+    globals().update(versioned_mod.__dict__)  # NOTE(review): __dict__ also carries dunders (__name__, __path__, __spec__, ...), so those are overwritten here too -- confirm intended
+except ImportError:  # NOTE(review): also swallows an ImportError raised while executing the versioned subpackage itself
+    # This is not a wheel build, but a conda or local build, do nothing
+    pass
+else:
+    del versioned_mod
+finally:
+    del bindings, importlib, subdir, cuda_major, cuda_minor  # runs on both paths; removes the helper names from this module's globals
+
+from cuda.core import utils  # noqa: E402
+from cuda.core._device import Device  # noqa: E402
+from cuda.core._event import Event, EventOptions  # noqa: E402
+from cuda.core._graph import (  # noqa: E402
+    Graph,
+    GraphBuilder,
+    GraphCompleteOptions,
+    GraphDebugPrintOptions,
+)
+from cuda.core._launch_config import LaunchConfig  # noqa: E402
+from cuda.core._launcher import launch  # noqa: E402
+from cuda.core._layout import _StridedLayout  # noqa: E402
+from cuda.core._linker import Linker, LinkerOptions  # noqa: E402
+from cuda.core._memory import (  # noqa: E402
+    Buffer,
+    DeviceMemoryResource,
+    DeviceMemoryResourceOptions,
+    GraphMemoryResource,
+    LegacyPinnedMemoryResource,
+    ManagedMemoryResource,
+    ManagedMemoryResourceOptions,
+    MemoryResource,
+    PinnedMemoryResource,
+    PinnedMemoryResourceOptions,
+    VirtualMemoryResource,
+    VirtualMemoryResourceOptions,
+)
+from cuda.core._memoryview import (  # noqa: E402
+    StridedMemoryView,  # noqa: E402
+    args_viewable_as_strided_memory,  # noqa: E402
+)
+from cuda.core._module import Kernel, ObjectCode  # noqa: E402
+from cuda.core._program import Program, ProgramOptions  # noqa: E402
+from cuda.core._stream import Stream, StreamOptions  # noqa: E402
+from cuda.core._system import System  # noqa: E402
+
+system = System()  # single module-level instance, exposed as the `system` attribute of this package
+__import__("sys").modules[__spec__.name + ".system"] = system  # registers the instance in sys.modules; NOTE(review): if globals().update(...) ran earlier in this file, __spec__ is the versioned subpackage's spec -- confirm the resulting key
+del System  # the class name is removed from this namespace; only the instance stays
diff --git a/cuda_core/cuda/core/experimental/_context.pyx → cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/experimental/_context.pyx → cuda_core/cuda/core/_context.pyx
@@ -4,7 +4,7 @@
 
 from dataclasses import dataclass
 
-from cuda.core.experimental._utils.cuda_utils import driver
+from cuda.core._utils.cuda_utils import driver
 
 
 @dataclass

diff --git a/cuda_core/cuda/core/experimental/_device.pyx → cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx → cuda_core/cuda/core/_device.pyx
@@ -6,27 +6,27 @@ cimport cpython
 from libc.stdint cimport uintptr_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN
+from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
 
 import threading
 from typing import Optional, TYPE_CHECKING, Union
 
-from cuda.core.experimental._context import Context, ContextOptions
-from cuda.core.experimental._event import Event, EventOptions
-from cuda.core.experimental._graph import GraphBuilder
-from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions
-from cuda.core.experimental._utils.clear_error_support import assert_type
-from cuda.core.experimental._utils.cuda_utils import (
+from cuda.core._context import Context, ContextOptions
+from cuda.core._event import Event, EventOptions
+from cuda.core._graph import GraphBuilder
+from cuda.core._stream import IsStreamT, Stream, StreamOptions
+from cuda.core._utils.clear_error_support import assert_type
+from cuda.core._utils.cuda_utils import (
     ComputeCapability,
     CUDAError,
     driver,
     handle_return,
     runtime,
 )
-from cuda.core.experimental._stream cimport default_stream
+from cuda.core._stream cimport default_stream
 
 if TYPE_CHECKING:
-    from cuda.core.experimental._memory import Buffer, MemoryResource
+    from cuda.core._memory import Buffer, MemoryResource
 
 # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python,
 # but it seems it is very convenient to expose them for testing purposes...
@@ -1034,7 +1034,7 @@ class Device:
         tuple of Device
             A tuple containing instances of available devices.
         """
-        from cuda.core.experimental import system
+        from cuda.core import system
         total = system.get_num_devices()
         return tuple(cls(device_id) for device_id in range(total))
 
@@ -1168,17 +1168,17 @@ class Device:
                     )
                 )
             if attr == 1:
-                from cuda.core.experimental._memory import DeviceMemoryResource
+                from cuda.core._memory import DeviceMemoryResource
                 self._memory_resource = DeviceMemoryResource(self._id)
             else:
-                from cuda.core.experimental._memory import _SynchronousMemoryResource
+                from cuda.core._memory import _SynchronousMemoryResource
                 self._memory_resource = _SynchronousMemoryResource(self._id)
 
         return self._memory_resource
 
     @memory_resource.setter
     def memory_resource(self, mr):
-        from cuda.core.experimental._memory import MemoryResource
+        from cuda.core._memory import MemoryResource
         assert_type(mr, MemoryResource)
         self._memory_resource = mr
 
@@ -1237,7 +1237,7 @@ class Device:
         Acts as an entry point of this object. Users always start a code by
         calling this method, e.g.
 
-        >>> from cuda.core.experimental import Device
+        >>> from cuda.core import Device
         >>> dev0 = Device(0)
         >>> dev0.set_current()
         >>> # ... do work on device 0 ...

diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd → cuda_core/cuda/core/_dlpack.pxd b/cuda_core/cuda/core/experimental/_dlpack.pxd → cuda_core/cuda/core/_dlpack.pxd
@@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t
 from libc.stdint cimport intptr_t
 
 
-cdef extern from "include/dlpack.h" nogil:
+cdef extern from "_include/dlpack.h" nogil:
     """
     #define DLPACK_TENSOR_UNUSED_NAME "dltensor"
     #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned"

diff --git a/cuda_core/cuda/core/experimental/_dlpack.pyx → cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/experimental/_dlpack.pyx → cuda_core/cuda/core/_dlpack.pyx
diff --git a/cuda_core/cuda/core/experimental/_event.pxd → cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/experimental/_event.pxd → cuda_core/cuda/core/_event.pxd