diff --git a/cuda_core/cuda/core/_program.pxd b/cuda_core/cuda/core/_program.pxd
index 02d436d3f3..7a6717059b 100644
--- a/cuda_core/cuda/core/_program.pxd
+++ b/cuda_core/cuda/core/_program.pxd
@@ -16,3 +16,5 @@ cdef class Program:
         object _compile_lock  # Per-instance lock for compile-time mutation
         bint _use_libdevice  # Flag for libdevice loading
         bint _libdevice_added
+        bytes _nvrtc_code  # Source code for NVRTC retry (PCH auto-resize)
+        str _pch_status  # PCH creation outcome after compile
diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx
index 68c0476b09..0b1fa93279 100644
--- a/cuda_core/cuda/core/_program.pyx
+++ b/cuda_core/cuda/core/_program.pyx
@@ -105,6 +105,32 @@ cdef class Program:
         """
         return Program_compile(self, target_type, name_expressions, logs)
 
+    @property
+    def pch_status(self) -> str | None:
+        """PCH creation outcome from the most recent :meth:`compile` call.
+
+        Possible values:
+
+        * ``"created"`` — PCH file was written successfully.
+        * ``"not_attempted"`` — PCH creation was not attempted (e.g. the
+          compiler decided not to, or automatic PCH processing skipped it).
+        * ``"failed"`` — an error prevented PCH creation.
+        * ``None`` — PCH was not requested, the program has not been compiled
+          yet (or the last compile raised), the backend is not NVRTC (e.g. PTX
+          or NVVM), or the NVRTC bindings are too old to report status.
+
+        When ``create_pch`` or ``pch`` is set in :class:`ProgramOptions`
+        and the PCH heap is too small, :meth:`compile` automatically resizes
+        the heap and retries once, so ``"created"`` should be the common outcome.
+
+        .. note::
+
+            PCH is only supported for ``code_type="c++"`` programs that
+            use the NVRTC backend. For PTX and NVVM programs this property
+            always returns ``None``.
+        """
+        return self._pch_status
+
     @property
     def backend(self) -> str:
         """Return this Program instance's underlying backend."""
@@ -548,6 +574,8 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
     self._use_libdevice = False
     self._libdevice_added = False
 
+    self._pch_status = None
+
     if code_type == "c++":
         assert_type(code, str)
         if options.extra_sources is not None:
@@ -562,6 +590,7 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
             HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
                 &nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
         self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
+        self._nvrtc_code = code_bytes
         self._backend = "NVRTC"
         self._linker = None
 
@@ -649,9 +678,19 @@ cdef inline int Program_init(Program self, object code, str code_type, object op
     return 0
 
 
-cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
-    """Compile using NVRTC backend and return ObjectCode."""
-    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+cdef object _nvrtc_compile_and_extract(
+    cynvrtc.nvrtcProgram prog, str target_type, object name_expressions,
+    object logs, list options_list, str name,
+):
+    """Run nvrtcCompileProgram on *prog* and extract the output.
+
+    This is the inner compile+extract step, factored out so the PCH
+    auto-retry path can call it on a fresh program handle.
+
+    *options_list* holds the already-encoded NVRTC options and *name* is
+    attached to the returned ObjectCode; both are passed in explicitly so
+    this helper does not depend on a Program instance.
+    """
     cdef size_t output_size = 0
     cdef size_t logsize = 0
     cdef vector[const char*] options_vec
@@ -669,7 +708,6 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
             HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
 
     # Build options array
-    options_list = self._options.as_bytes("nvrtc", target_type)
     options_vec.resize(len(options_list))
     for i in range(len(options_list)):
         options_vec[i] = (options_list[i])
@@ -716,7 +754,116 @@ cdef object Program_compile_nvrtc(Program self, str target_type, object name_exp
             HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
         logs.write(log.decode("utf-8", errors="backslashreplace"))
 
-    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=self._options.name)
+    return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=name)
+
+
+cdef int _nvrtc_pch_apis_cached = -1  # -1 = unchecked
+
+# NVRTC PCH entry points used by the status query and the heap-resize retry.
+cdef tuple _NVRTC_PCH_API_NAMES = (
+    "nvrtcGetPCHCreateStatus",
+    "nvrtcGetPCHHeapSizeRequired",
+    "nvrtcSetPCHHeapSize",
+)
+
+
+cdef bint _has_nvrtc_pch_apis():
+    """Return True if the NVRTC bindings expose every PCH API used here (cached)."""
+    global _nvrtc_pch_apis_cached
+    cdef str api_name
+    if _nvrtc_pch_apis_cached < 0:
+        _nvrtc_pch_apis_cached = 1
+        for api_name in _NVRTC_PCH_API_NAMES:
+            if not hasattr(nvrtc, api_name):
+                _nvrtc_pch_apis_cached = 0
+                break
+    return _nvrtc_pch_apis_cached
+
+
+cdef str _PCH_STATUS_CREATED = "created"
+cdef str _PCH_STATUS_NOT_ATTEMPTED = "not_attempted"
+cdef str _PCH_STATUS_FAILED = "failed"
+
+
+cdef str _read_pch_status(cynvrtc.nvrtcProgram prog):
+    """Query nvrtcGetPCHCreateStatus and translate to a high-level string.
+
+    Returns one of the ``_PCH_STATUS_*`` strings, or ``None`` when NVRTC
+    reports PCH heap exhaustion. ``None`` is an internal sentinel telling
+    the caller to resize the heap and retry.
+    """
+    cdef cynvrtc.nvrtcResult err
+    with nogil:
+        err = cynvrtc.nvrtcGetPCHCreateStatus(prog)
+    if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
+        return _PCH_STATUS_CREATED
+    if err == cynvrtc.nvrtcResult.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED:
+        return None  # sentinel: caller should auto-retry
+    if err == cynvrtc.nvrtcResult.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED:
+        return _PCH_STATUS_NOT_ATTEMPTED
+    return _PCH_STATUS_FAILED
+
+
+cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
+    """Compile using NVRTC backend and return ObjectCode.
+
+    When ``create_pch`` or ``pch`` is set and the PCH APIs are available,
+    the PCH creation status is recorded in ``self._pch_status``. If NVRTC
+    reports PCH heap exhaustion, the heap is resized to the size NVRTC
+    asks for and the source is recompiled once on a fresh program handle.
+    """
+    cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
+    cdef list options_list = self._options.as_bytes("nvrtc", target_type)
+    cdef bint pch_creation_possible = self._options.create_pch or self._options.pch
+    cdef size_t required = 0
+    cdef cynvrtc.nvrtcProgram retry_prog
+    cdef const char* code_ptr
+    cdef const char* name_ptr
+
+    # Reset up front so a compile that raises cannot leave a stale status
+    # from an earlier call behind.
+    self._pch_status = None
+
+    result = _nvrtc_compile_and_extract(
+        prog, target_type, name_expressions, logs, options_list, self._options.name,
+    )
+
+    if not pch_creation_possible or not _has_nvrtc_pch_apis():
+        return result
+
+    try:
+        status = _read_pch_status(prog)
+    except RuntimeError as e:
+        raise RuntimeError(
+            "PCH was requested but the runtime libnvrtc does not support "
+            "PCH APIs. Update to CUDA toolkit 12.8 or newer."
+        ) from e
+
+    if status is not None:
+        self._pch_status = status
+        return result
+
+    # Heap exhausted — auto-resize and retry with a fresh program
+    with nogil:
+        HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPCHHeapSizeRequired(prog, &required))
+        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))
+
+    code_ptr = self._nvrtc_code
+    name_ptr = self._options._name
+    with nogil:
+        HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
+            &retry_prog, code_ptr, name_ptr, 0, NULL, NULL))
+    self._h_nvrtc = create_nvrtc_program_handle(retry_prog)
+
+    # NOTE(review): the first attempt has already written its log to *logs*,
+    # so after a retry *logs* holds the output of both compiles.
+    result = _nvrtc_compile_and_extract(
+        retry_prog, target_type, name_expressions, logs, options_list, self._options.name,
+    )
+
+    status = _read_pch_status(retry_prog)
+    self._pch_status = status if status is not None else _PCH_STATUS_FAILED
+    return result
 
 
 cdef object Program_compile_nvvm(Program self, str target_type, object logs):
diff --git a/cuda_core/docs/source/release/0.6.0-notes.rst b/cuda_core/docs/source/release/0.6.0-notes.rst
index b7d6188cc2..654eb7641b 100644
--- a/cuda_core/docs/source/release/0.6.0-notes.rst
+++ b/cuda_core/docs/source/release/0.6.0-notes.rst
@@ -54,6 +54,11 @@ New features
 - Added CUDA version compatibility check at import time to detect mismatches between
   ``cuda.core`` and the installed ``cuda-bindings`` version.
 
+- ``Program.compile()`` now automatically resizes the NVRTC PCH heap and
+  retries when precompiled header creation fails due to heap exhaustion.
+  The ``pch_status`` property reports the PCH creation outcome
+  (``"created"``, ``"not_attempted"``, ``"failed"``, or ``None``).
+
 
 Fixes and enhancements
 ----------------------
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index edf249eb60..0005777b52 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -57,6 +57,22 @@ def _get_nvrtc_version_for_tests():
     return None
 
 
+def _has_nvrtc_pch_apis_for_tests():
+    required = (
+        "nvrtcGetPCHHeapSize",
+        "nvrtcSetPCHHeapSize",
+        "nvrtcGetPCHCreateStatus",
+        "nvrtcGetPCHHeapSizeRequired",
+    )
+    return all(hasattr(nvrtc, name) for name in required)
+
+
+nvrtc_pch_available = pytest.mark.skipif(
+    (_get_nvrtc_version_for_tests() or 0) < 12800 or not _has_nvrtc_pch_apis_for_tests(),
+    reason="PCH runtime APIs require NVRTC >= 12.8 bindings",
+)
+
+
 _libnvvm_version = None
 _libnvvm_version_attempted = False
 
@@ -316,6 +332,25 @@ def test_cpp_program_with_pch_options(init_cuda, tmp_path):
     program.close()
 
 
+@nvrtc_pch_available
+def test_cpp_program_pch_auto_creates(init_cuda, tmp_path):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    pch_path = str(tmp_path / "test.pch")
+    program = Program(code, "c++", ProgramOptions(create_pch=pch_path))
+    assert program.pch_status is None  # not compiled yet
+    program.compile("ptx")
+    assert program.pch_status in ("created", "not_attempted", "failed")
+    program.close()
+
+
+def test_cpp_program_pch_status_none_without_pch(init_cuda):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    program.compile("ptx")
+    assert program.pch_status is None
+    program.close()
+
+
 options = [
     ProgramOptions(max_register_count=32),
     ProgramOptions(debug=True),