Skip to content
53 changes: 48 additions & 5 deletions cuda_core/cuda/core/experimental/_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import weakref
from contextlib import contextmanager
from dataclasses import dataclass
from typing import TYPE_CHECKING, Union
from typing import TYPE_CHECKING, List, Tuple, Union
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is safe to use normal Python list/tuple in typing now

from warnings import warn

if TYPE_CHECKING:
Expand Down Expand Up @@ -298,6 +298,10 @@ class ProgramOptions:
split_compile: int | None = None
fdevice_syntax_only: bool | None = None
minimal: bool | None = None
# A sequence of 2-tuples: ((name1, source1), (name2, source2), ...)
extra_sources: (
Union[List[Tuple[str, Union[str, bytes, bytearray]]], Tuple[Tuple[str, Union[str, bytes, bytearray]]]] | None
) = None
numba_debug: bool | None = None # Custom option for Numba debugging

def __post_init__(self):
Expand Down Expand Up @@ -419,8 +423,6 @@ def __post_init__(self):
self._formatted_options.append("--fdevice-syntax-only")
if self.minimal is not None and self.minimal:
self._formatted_options.append("--minimal")
if self.numba_debug:
self._formatted_options.append("--numba-debug")
Comment on lines -422 to -423
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw plz do not remove this and the related tests, however ugly this is needed for future devtool works 😅


def _as_bytes(self):
# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
Expand Down Expand Up @@ -470,26 +472,33 @@ def close(self):
nvvm.destroy_program(self.handle)
self.handle = None

__slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
__slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options", "_module_count")

def __init__(self, code, code_type, options: ProgramOptions = None):
self._mnff = Program._MembersNeededForFinalize(self, None, None)

self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
self._module_count = 0

if code_type == "c++":
assert_type(code, str)
# TODO: support pre-loaded headers & include names
# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved

if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")

# TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), options._name, 0, [], []))
self._mnff.backend = "NVRTC"
self._backend = "NVRTC"
self._linker = None

elif code_type == "ptx":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the PTX backend.")

self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
)
Expand All @@ -505,6 +514,40 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
self._mnff.handle = nvvm.create_program()
self._mnff.backend = "NVVM"
nvvm.add_module_to_program(self._mnff.handle, code, len(code), options._name.decode())
self._module_count = 1
# Add extra modules if provided
if options.extra_sources is not None:
if not is_sequence(options.extra_sources):
raise TypeError(
"extra_modules must be a sequence of 2-tuples:((name1, source1), (name2, source2), ...)"
)
for i, module in enumerate(options.extra_sources):
if not isinstance(module, tuple) or len(module) != 2:
raise TypeError(
f"Each extra module must be a 2-tuple (name, source)"
f", got {type(module).__name__} at index {i}"
)

module_name, module_source = module

if not isinstance(module_name, str):
raise TypeError(f"Module name at index {i} must be a string,got {type(module_name).__name__}")

if isinstance(module_source, str):
# Textual LLVM IR - encode to UTF-8 bytes
module_source = module_source.encode("utf-8")
elif not isinstance(module_source, (bytes, bytearray)):
raise TypeError(
f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
f"or bytearray, got {type(module_source).__name__}"
)

if len(module_source) == 0:
raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")

nvvm.add_module_to_program(self._mnff.handle, module_source, len(module_source), module_name)
self._module_count += 1

self._backend = "NVVM"
self._linker = None

Expand Down
220 changes: 195 additions & 25 deletions cuda_core/tests/test_program.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@
cuda_driver_version = handle_return(driver.cuDriverGetVersion())
is_culink_backend = _linker._decide_nvjitlink_or_driver()

# Optional test-only dependency: the minimal bitcode fixture lives in
# cuda_python_test_helpers, which is not installed in every environment.
# Tests that need it are skipped based on this flag.
_test_helpers_available = True
try:
    from cuda_python_test_helpers.nvvm_bitcode import minimal_nvvmir
except ImportError:
    _test_helpers_available = False


def _is_nvvm_available():
"""Check if NVVM is available."""
Expand All @@ -31,29 +38,12 @@ def _is_nvvm_available():
)

try:
from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc
from cuda.core.experimental._utils.cuda_utils import driver, handle_return

_cuda_driver_version = handle_return(driver.cuDriverGetVersion())
except Exception:
_cuda_driver_version = 0


def _get_nvrtc_version_for_tests():
    """
    Get NVRTC version.

    Returns:
        int: Version in format major * 1000 + minor * 100 (e.g., 13200 for CUDA 13.2)
        None: If NVRTC is not available
    """
    try:
        # Any failure here (missing library, API error) means "not available".
        major, minor = handle_return(nvrtc.nvrtcVersion())
        return major * 1000 + minor * 100
    except Exception:
        return None


_libnvvm_version = None
_libnvvm_version_attempted = False

Expand Down Expand Up @@ -193,13 +183,6 @@ def ptx_code_object():
[
ProgramOptions(name="abc"),
ProgramOptions(device_code_optimize=True, debug=True),
pytest.param(
ProgramOptions(debug=True, numba_debug=True),
marks=pytest.mark.skipif(
(_get_nvrtc_version_for_tests() or 0) < 13200,
reason="numba_debug requires NVRTC >= 13.2",
),
),
ProgramOptions(relocatable_device_code=True, max_register_count=32),
ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
ProgramOptions(fma=False, use_fast_math=True),
Expand Down Expand Up @@ -411,3 +394,190 @@ def test_nvvm_program_options(init_cuda, nvvm_ir, options):
assert ".visible .entry simple(" in ptx_text

program.close()


@nvvm_available
@pytest.mark.parametrize(
    "options",
    [
        ProgramOptions(name="ltoir_test1", arch="sm_90", device_code_optimize=False),
        ProgramOptions(name="ltoir_test2", arch="sm_100", link_time_optimization=True),
        ProgramOptions(
            name="ltoir_test3",
            arch="sm_90",
            ftz=True,
            prec_sqrt=False,
            prec_div=False,
            fma=True,
            device_code_optimize=True,
            link_time_optimization=True,
        ),
    ],
)
def test_nvvm_program_options_ltoir(init_cuda, nvvm_ir, options):
    """Test NVVM programs compiled to LTOIR with different option combinations."""
    program = Program(nvvm_ir, "nvvm", options)
    # FIX: release the program handle even if an assertion fails, instead of
    # leaking it on test failure.
    try:
        assert program.backend == "NVVM"

        ltoir_code = program.compile("ltoir")
        assert isinstance(ltoir_code, ObjectCode)
        assert ltoir_code.name == options.name
        code_content = ltoir_code.code
        assert len(code_content) > 0
    finally:
        program.close()


@nvvm_available
def test_nvvm_program_with_single_extra_source(nvvm_ir):
    """Test that an NVVM program links one extra IR module supplied via extra_sources."""
    from cuda.core.experimental._program import _get_nvvm_module

    nvvm = _get_nvvm_module()
    # Query the IR version so the helper module's !nvvmir.version metadata
    # matches what this libNVVM accepts.
    major, minor, debug_major, debug_minor = nvvm.ir_version()
    # Helper NVVM IR for multiple-module loading.
    helper_nvvmir = f"""target triple = "nvptx64-unknown-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

define i32 @helper_add(i32 %x) {{
entry:
%result = add i32 %x, 1
ret i32 %result
}}

!nvvmir.version = !{{!0}}
!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
"""  # noqa: E501

    options = ProgramOptions(
        name="multi_module_test",
        extra_sources=[
            ("helper", helper_nvvmir),
        ],
    )
    program = Program(nvvm_ir, "nvvm", options)

    # FIX: release the program handle even if an assertion fails, instead of
    # leaking it on test failure.
    try:
        assert program.backend == "NVVM"

        ptx_code = program.compile("ptx")
        assert isinstance(ptx_code, ObjectCode)
        assert ptx_code.name == "multi_module_test"
    finally:
        program.close()


@nvvm_available
def test_nvvm_program_with_multiple_extra_sources():
    """Test NVVM program with multiple extra sources.

    Links a main kernel module against two separately supplied helper
    modules via ProgramOptions.extra_sources, then compiles to both
    PTX and LTOIR.
    """
    from cuda.core.experimental._program import _get_nvvm_module

    nvvm = _get_nvvm_module()
    # Query the IR version so every module's !nvvmir.version metadata matches
    # what this libNVVM accepts.
    major, minor, debug_major, debug_minor = nvvm.ir_version()

    # Main module: declares the two helpers and calls them from the kernel.
    main_nvvm_ir = f"""target triple = "nvptx64-unknown-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i32 @helper_add(i32) nounwind readnone
declare i32 @helper_mul(i32) nounwind readnone

define void @main_kernel(i32* %data) {{
entry:
%tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%ptr = getelementptr inbounds i32, i32* %data, i32 %tid
%val = load i32, i32* %ptr, align 4

%val1 = call i32 @helper_add(i32 %val)
%val2 = call i32 @helper_mul(i32 %val1)

store i32 %val2, i32* %ptr, align 4
ret void
}}

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone

!nvvm.annotations = !{{!0}}
!0 = !{{void (i32*)* @main_kernel, !"kernel", i32 1}}

!nvvmir.version = !{{!1}}
!1 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
"""  # noqa: E501

    # First helper module: defines @helper_add.
    helper1_ir = f"""target triple = "nvptx64-unknown-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

define i32 @helper_add(i32 %x) nounwind readnone {{
entry:
%result = add i32 %x, 1
ret i32 %result
}}

!nvvmir.version = !{{!0}}
!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
"""  # noqa: E501

    # Second helper module: defines @helper_mul.
    helper2_ir = f"""target triple = "nvptx64-unknown-cuda"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

define i32 @helper_mul(i32 %x) nounwind readnone {{
entry:
%result = mul i32 %x, 2
ret i32 %result
}}

!nvvmir.version = !{{!0}}
!0 = !{{i32 {major}, i32 {minor}, i32 {debug_major}, i32 {debug_minor}}}
"""  # noqa: E501

    # extra_sources is a sequence of (name, source) 2-tuples.
    options = ProgramOptions(
        name="nvvm_multi_helper_test",
        extra_sources=[
            ("helper1", helper1_ir),
            ("helper2", helper2_ir),
        ],
    )
    program = Program(main_nvvm_ir, "nvvm", options)

    assert program.backend == "NVVM"
    # Both output kinds should work from the same multi-module program.
    ptx_code = program.compile("ptx")
    assert isinstance(ptx_code, ObjectCode)
    assert ptx_code.name == "nvvm_multi_helper_test"

    ltoir_code = program.compile("ltoir")
    assert isinstance(ltoir_code, ObjectCode)
    assert ltoir_code.name == "nvvm_multi_helper_test"

    program.close()


@nvvm_available
@pytest.mark.skipif(not _test_helpers_available, reason="cuda_python_test_helpers not accessible")
def test_bitcode_format(minimal_nvvmir):
    """Compile a minimal NVVM bitcode module to PTX, and best-effort to LTOIR.

    PTX compilation must succeed; LTOIR compilation is allowed to fail
    (support varies by toolkit), in which case the failure is only reported.
    """
    if len(minimal_nvvmir) < 4:
        pytest.skip("Bitcode file is not valid or empty")

    options = ProgramOptions(name="minimal_nvvmir_bitcode_test", arch="sm_90")
    program = Program(minimal_nvvmir, "nvvm", options)
    try:
        assert program.backend == "NVVM"
        ptx_result = program.compile("ptx")
        assert isinstance(ptx_result, ObjectCode)
        assert ptx_result.name == "minimal_nvvmir_bitcode_test"
        assert len(ptx_result.code) > 0
    finally:
        program.close()

    program_lto = Program(minimal_nvvmir, "nvvm", options)
    try:
        ltoir_result = program_lto.compile("ltoir")
        assert isinstance(ltoir_result, ObjectCode)
        assert len(ltoir_result.code) > 0
        print(f"LTOIR size: {len(ltoir_result.code)} bytes")
    except Exception as e:
        # Best-effort path: LTOIR failure is tolerated and reported.
        print(f"LTOIR compilation failed : {e}")
    finally:
        # FIX: previously the finally closed `program` here and `program_lto`
        # was never closed, leaking its NVVM handle.
        program_lto.close()


def test_cpp_program_with_extra_sources():
    """Negative test: the NVRTC (C++) backend must reject extra_sources."""
    code = 'extern "C" __global__ void my_kernel(){}'
    helper = 'extern "C" __global__ void helper(){}'
    # FIX: use a well-formed value (sequence of (name, source) 2-tuples) so the
    # test keeps targeting the backend rejection even if argument-shape
    # validation is ever tightened to run before the backend check.
    options = ProgramOptions(extra_sources=[("helper", helper)])
    with pytest.raises(ValueError, match="extra_sources is not supported by the NVRTC backend"):
        Program(code, "c++", options)
Loading