Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cuda_core/cuda/core/_program.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ cdef class Program:
object _compile_lock # Per-instance lock for compile-time mutation
bint _use_libdevice # Flag for libdevice loading
bint _libdevice_added
bytes _nvrtc_code # Source code for NVRTC retry (PCH auto-resize)
str _pch_status # PCH creation outcome after compile
123 changes: 118 additions & 5 deletions cuda_core/cuda/core/_program.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,32 @@ cdef class Program:
"""
return Program_compile(self, target_type, name_expressions, logs)

@property
def pch_status(self) -> str | None:
    """PCH creation outcome from the most recent :meth:`compile` call.

    Possible values:

    * ``"created"`` — PCH file was written successfully.
    * ``"not_attempted"`` — PCH creation was not attempted (e.g. the
      compiler decided not to, or automatic PCH processing skipped it).
    * ``"failed"`` — an error prevented PCH creation. This is also the
      value reported when the PCH heap is still too small after the
      automatic resize-and-retry.
    * ``None`` — PCH was not requested, the program has not been
      compiled yet, the backend is not NVRTC (e.g. PTX or NVVM),
      or the NVRTC bindings are too old to report status.

    When ``create_pch`` is set in :class:`ProgramOptions` and the PCH
    heap is too small, :meth:`compile` automatically resizes the heap
    and retries once, so ``"created"`` should be the common outcome.

    .. note::

       PCH is only supported for ``code_type="c++"`` programs that
       use the NVRTC backend. For PTX and NVVM programs this property
       always returns ``None``.
    """
    return self._pch_status

@property
def backend(self) -> str:
"""Return this Program instance's underlying backend."""
Expand Down Expand Up @@ -477,6 +503,8 @@ def _find_libdevice_path():
return find_bitcode_lib("device")




cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
"""Process a single define macro, returning True if successful."""
if isinstance(macro, str):
Expand Down Expand Up @@ -548,6 +576,8 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
self._use_libdevice = False
self._libdevice_added = False

self._pch_status = None

if code_type == "c++":
assert_type(code, str)
if options.extra_sources is not None:
Expand All @@ -562,6 +592,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
self._nvrtc_code = code_bytes
self._backend = "NVRTC"
self._linker = None

Expand Down Expand Up @@ -649,9 +680,15 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
return 0


cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
"""Compile using NVRTC backend and return ObjectCode."""
cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
cdef object _nvrtc_compile_and_extract(
cynvrtc.nvrtcProgram prog, str target_type, object name_expressions,
object logs, list options_list, str name,
):
"""Run nvrtcCompileProgram on *prog* and extract the output.

This is the inner compile+extract loop, factored out so the PCH
auto-retry path can call it on a fresh program handle.
"""
cdef size_t output_size = 0
cdef size_t logsize = 0
cdef vector[const char*] options_vec
Expand All @@ -669,7 +706,6 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))

# Build options array
options_list = self._options.as_bytes("nvrtc", target_type)
options_vec.resize(len(options_list))
for i in range(len(options_list)):
options_vec[i] = <const char*>(<bytes>options_list[i])
Expand Down Expand Up @@ -716,7 +752,84 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
logs.write(log.decode("utf-8", errors="backslashreplace"))

return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=name)


cdef int _nvrtc_pch_apis_cached = -1  # -1 = unchecked, otherwise 0/1

cdef bint _has_nvrtc_pch_apis():
    """Return whether the nvrtc bindings expose the PCH runtime APIs.

    The answer is computed on the first call and cached in
    ``_nvrtc_pch_apis_cached`` for the lifetime of the module.

    All three entry points used by the PCH status/auto-retry path are
    probed, not only ``nvrtcGetPCHCreateStatus``, so a partially populated
    bindings module is never reported as PCH-capable.

    NOTE(review): ``hasattr`` only shows that the bindings were generated
    with these functions. Bindings built against a new toolkit can still be
    run with an older libnvrtc, in which case the attribute exists but the
    call raises ``RuntimeError`` because the symbol cannot be found.
    Callers must therefore still handle ``RuntimeError`` from the first PCH
    call. Review feedback suggested the bindings' autogenerated
    ``_inspect_function_pointers`` helper as a more reliable check —
    TODO confirm its interface before switching.
    """
    global _nvrtc_pch_apis_cached
    if _nvrtc_pch_apis_cached < 0:
        _nvrtc_pch_apis_cached = (
            hasattr(nvrtc, "nvrtcGetPCHCreateStatus")
            and hasattr(nvrtc, "nvrtcGetPCHHeapSizeRequired")
            and hasattr(nvrtc, "nvrtcSetPCHHeapSize")
        )
    return _nvrtc_pch_apis_cached


# High-level PCH outcome strings. They are returned by ``_read_pch_status``
# and stored on ``Program._pch_status``, which the public ``pch_status``
# property returns unchanged.
cdef str _PCH_STATUS_CREATED = "created"
cdef str _PCH_STATUS_NOT_ATTEMPTED = "not_attempted"
cdef str _PCH_STATUS_FAILED = "failed"


cdef str _read_pch_status(cynvrtc.nvrtcProgram prog):
    """Translate ``nvrtcGetPCHCreateStatus`` for *prog* into a status string.

    Returns one of the ``_PCH_STATUS_*`` strings, or ``None`` when NVRTC
    reports that the PCH heap was exhausted. ``None`` is a sentinel, not a
    status: the caller is expected to enlarge the heap and compile again.
    """
    cdef cynvrtc.nvrtcResult rc

    with nogil:
        rc = cynvrtc.nvrtcGetPCHCreateStatus(prog)

    if rc == cynvrtc.nvrtcResult.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED:
        # sentinel: caller should auto-retry
        return None
    elif rc == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
        return _PCH_STATUS_CREATED
    elif rc == cynvrtc.nvrtcResult.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED:
        return _PCH_STATUS_NOT_ATTEMPTED
    else:
        # Every remaining result code is reported as a failure
        # (presumably NVRTC_ERROR_PCH_CREATE in practice — confirm).
        return _PCH_STATUS_FAILED
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this return refers to NVRTC_ERROR_PCH_CREATE



cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
    """Compile using NVRTC backend and return ObjectCode.

    After the compile, the PCH creation outcome is recorded in
    ``self._pch_status`` when PCH was requested (``create_pch`` or ``pch``
    option) and the bindings expose the PCH APIs; otherwise it is ``None``.
    If NVRTC reports that the PCH heap was exhausted, the heap is resized to
    the size NVRTC asks for and the compile is repeated once on a fresh
    program handle created from the source saved in ``self._nvrtc_code``.

    *target_type*, *name_expressions* and *logs* are forwarded unchanged to
    ``_nvrtc_compile_and_extract``.

    Raises
    ------
    RuntimeError
        If PCH was requested but querying the PCH status raised
        ``RuntimeError`` (the loaded libnvrtc lacks the PCH APIs).
    """
    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
    cdef cynvrtc.nvrtcProgram retry_prog
    cdef list options_list = self._options.as_bytes("nvrtc", target_type)
    cdef bint pch_creation_possible
    cdef size_t required = 0
    cdef const char* code_ptr
    cdef const char* name_ptr

    # Forget the outcome of any earlier compile() first: if this compile
    # raises, pch_status must not keep reporting the previous call's result.
    self._pch_status = None

    result = _nvrtc_compile_and_extract(
        prog, target_type, name_expressions, logs, options_list, self._options.name,
    )

    pch_creation_possible = self._options.create_pch or self._options.pch
    if not pch_creation_possible or not _has_nvrtc_pch_apis():
        # Nothing to report; _pch_status is already None.
        return result

    try:
        status = _read_pch_status(prog)
    except RuntimeError as e:
        raise RuntimeError(
            "PCH was requested but the runtime libnvrtc does not support "
            "PCH APIs. Update to CUDA toolkit 12.8 or newer."
        ) from e

    if status is not None:
        self._pch_status = status
        return result

    # Heap exhausted — auto-resize and retry with a fresh program
    with nogil:
        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPCHHeapSizeRequired(prog, &required))
        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))

    code_ptr = <const char*>self._nvrtc_code
    name_ptr = <const char*>self._options._name
    with nogil:
        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
            &retry_prog, code_ptr, name_ptr, 0, NULL, NULL))
    # Replace the stored handle so later operations use the retried program.
    self._h_nvrtc = create_nvrtc_program_handle(retry_prog)

    # NOTE(review): the first attempt may already have written its compile
    # log to *logs*; the retry presumably appends a second log — confirm
    # whether duplicate log output is acceptable.
    result = _nvrtc_compile_and_extract(
        retry_prog, target_type, name_expressions, logs, options_list, self._options.name,
    )

    # Only one retry is made: a second heap-exhausted result is a failure.
    status = _read_pch_status(retry_prog)
    self._pch_status = status if status is not None else _PCH_STATUS_FAILED
    return result


cdef object Program_compile_nvvm(Program self, str target_type, object logs):
Expand Down
5 changes: 5 additions & 0 deletions cuda_core/docs/source/release/0.6.0-notes.rst
Comment thread
leofang marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ New features
- Added CUDA version compatibility check at import time to detect mismatches between
``cuda.core`` and the installed ``cuda-bindings`` version.

- ``Program.compile()`` now automatically resizes the NVRTC PCH heap and
retries when precompiled header creation fails due to heap exhaustion.
The ``pch_status`` property reports the PCH creation outcome
(``"created"``, ``"not_attempted"``, ``"failed"``, or ``None``).


Fixes and enhancements
----------------------
Expand Down
35 changes: 35 additions & 0 deletions cuda_core/tests/test_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@ def _get_nvrtc_version_for_tests():
return None


def _has_nvrtc_pch_apis_for_tests():
    """Return True only if the nvrtc bindings expose every PCH runtime API."""
    for api_name in (
        "nvrtcGetPCHHeapSize",
        "nvrtcSetPCHHeapSize",
        "nvrtcGetPCHCreateStatus",
        "nvrtcGetPCHHeapSizeRequired",
    ):
        if not hasattr(nvrtc, api_name):
            return False
    return True


# Skip marker for tests that exercise the PCH runtime APIs. The test is
# skipped when the version from _get_nvrtc_version_for_tests() is falsy
# (e.g. None) or below 12800, or when the nvrtc bindings lack any of the
# PCH entry points checked by _has_nvrtc_pch_apis_for_tests().
nvrtc_pch_available = pytest.mark.skipif(
    (_get_nvrtc_version_for_tests() or 0) < 12800 or not _has_nvrtc_pch_apis_for_tests(),
    reason="PCH runtime APIs require NVRTC >= 12.8 bindings",
)


_libnvvm_version = None
_libnvvm_version_attempted = False

Expand Down Expand Up @@ -316,6 +332,25 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path):
program.close()


@nvrtc_pch_available
def test_cpp_program_pch_auto_creates(init_cuda, tmp_path):
    """pch_status is None before compile and a status string afterwards."""
    code = 'extern "C" __global__ void my_kernel() {}'
    pch_path = str(tmp_path / "test.pch")
    program = Program(code, "c++", ProgramOptions(create_pch=pch_path))
    # Close the program even when compile() or an assertion fails, so a
    # failing run does not leak the underlying NVRTC program.
    try:
        assert program.pch_status is None  # not compiled yet
        program.compile("ptx")
        assert program.pch_status in ("created", "not_attempted", "failed")
    finally:
        program.close()


def test_cpp_program_pch_status_none_without_pch(init_cuda):
    """pch_status stays None when no PCH option was requested."""
    code = 'extern "C" __global__ void my_kernel() {}'
    program = Program(code, "c++")
    # Close the program even when compile() or the assertion fails.
    try:
        program.compile("ptx")
        assert program.pch_status is None
    finally:
        program.close()


options = [
ProgramOptions(max_register_count=32),
ProgramOptions(debug=True),
Expand Down
Loading